├── .DS_Store
├── examples
    ├── __init__.py
    ├── .DS_Store
    ├── firecrawl_automated_whitepaper_tracking
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── test_webhook.py
    │   │   └── test_semantic_filter.py
    │   ├── .DS_Store
    │   ├── test.png
    │   ├── images
    │   │   └── hf-daily-papers-github-logo.png
    │   ├── __init__.py
    │   ├── .gitignore
    │   ├── .env_example
    │   ├── pyproject.toml
    │   ├── paper-tracker.yml
    │   ├── category_prompt.py
    │   ├── logging_config.py
    │   ├── discord_notifications.py
    │   ├── hf_white_paper_tracker.py
    │   ├── semantic_filter.py
    │   ├── x_post_v3.py
    │   ├── x_post.py
    │   ├── firecrawl_crawl_extract.py
    │   ├── supabase_db.py
    │   ├── README.md
    │   └── x_post_v2.py
    └── .gitignore
├── .github
    ├── .DS_Store
    └── workflows
    │   └── paper-tracker.yml
├── images
    └── firecrawl-quickstarts-github-cover.png
├── .gitignore
├── README.md
├── llm_extract_tutorial.ipynb
├── crawl_and_extract_with_openai_o1.ipynb
├── crawl_and_extract_with_xai_grok.ipynb
└── claude_researcher_with_map.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/.DS_Store
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 | __doc__ = """Empty file to mark directory as Python package"""
2 |
--------------------------------------------------------------------------------
/.github/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/.github/.DS_Store
--------------------------------------------------------------------------------
/examples/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/.DS_Store
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/tests/__init__.py:
--------------------------------------------------------------------------------
1 | __doc__ = """Empty file to mark directory as Python package"""
2 |
--------------------------------------------------------------------------------
/images/firecrawl-quickstarts-github-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/images/firecrawl-quickstarts-github-cover.png
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/.DS_Store
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/test.png
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/images/hf-daily-papers-github-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/images/hf-daily-papers-github-logo.png -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Python cache files 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | 7 | # Logs 8 | logs/ 9 | *.log 10 | **/logs/ 11 | **/logs/** 12 | 13 | # macOS system files 14 | .DS_Store 15 | .DS_Store? 16 | ._* 17 | 18 | # IDE 19 | .idea/ 20 | *.iml -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Empty __init__.py file to ensure this directory is recognized as a package/module, 3 | allowing imports like "from category_prompt import DESIRED_CATEGORY" in discord_notifications.py. 4 | """ -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env 3 | 4 | # PyCharm 5 | .idea/ 6 | *.iml 7 | *.iws 8 | *.ipr 9 | *.xml 10 | .idea_modules/ 11 | *.pyc 12 | __pycache__/ 13 | *.pyo 14 | *.pyd 15 | .Python 16 | .python-version 17 | 18 | # GitHub Actions 19 | .github/workflows/.env 20 | 21 | # Temporary files 22 | *.log 23 | *.json 24 | 25 | # Add this near the top, before the *.pyc line 26 | !__init__.py 27 | 28 | # Database migrations 29 | migrate_to_v1.py 30 | migrate_to_v2.py 31 | migrate_to_v3.py 32 | revert_to_v2.py 33 | migrate_make_dates_nullable.py 34 | 35 | # Logs directory - comprehensive exclusion 36 | logs/ 37 | *.log 38 | examples/logs/ 39 | examples/logs/** 40 | **/logs/ 41 | **/logs/** -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/.env_example: -------------------------------------------------------------------------------- 1 | # Firecrawl API credentials 2 | FIRECRAWL_API_KEY=your_firecrawl_api_key 3 | 4 | # Database configuration 5 | POSTGRES_URL=postgresql://user:password@host:port/database 6 | 7 | # Discord webhook for notifications 8 | DISCORD_WEBHOOK_URL=your_discord_webhook_url 9 | 10 | # OpenAI API credentials 11 | OPENAI_API_KEY=your_openai_api_key 12 | 13 | # X (Twitter) API credentials 14 | X_API_KEY=your_x_api_key 15 | X_API_SECRET=your_x_api_secret 16 | X_OAUTH2_CLIENT_ID=your_oauth2_client_id 17 | X_OAUTH2_CLIENT_SECRET=your_oauth2_client_secret 18 | 19 | # The following X OAuth tokens will be auto-generated after local authorization 20 | # X_ACCESS_TOKEN= 21 | # X_REFRESH_TOKEN= 22 | # X_TOKEN_EXPIRES_IN= 23 | # X_TOKEN_SCOPE= -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "firecrawl-automated-whitepaper-tracking" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["alexfazio "] 6 | license = "MIT" 7 | readme = "README.md" 8 | package-mode = false 9 | 10 | [tool.poetry.dependencies] 11 | python = "3.10.15" 12 | firecrawl-py = "1.6.8" 13 | pydantic = "2.10.4" 14 | psycopg2-binary = "2.9.10" 15 | python-dotenv = "1.0.1" 16 | sqlalchemy = "*" 17 | pandas = "2.2.3" 18 | 
plotly = "5.24.1" 19 | aiohttp = "3.11.11" 20 | altair = "5.5.0" 21 | openai = ">=0.28.0" 22 | pytz = "2024.1" 23 | requests-oauthlib = "^2.0.0" 24 | requests = "^2.32.3" 25 | playwright = "^1.49.1" 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/tests/test_webhook.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for testing the Discord webhook.""" 2 | 3 | import os 4 | from dotenv import load_dotenv 5 | import requests 6 | import json 7 | 8 | def test_discord_webhook(): 9 | """Test the Discord webhook configuration by sending a test message.""" 10 | # Load environment variables 11 | load_dotenv() 12 | 13 | webhook_url = os.getenv('DISCORD_WEBHOOK_URL') 14 | if not webhook_url: 15 | raise ValueError("DISCORD_WEBHOOK_URL not found in .env file") 16 | 17 | # Test message 18 | message = { 19 | "content": "🎉 Webhook test successful! Your Discord notifications are working.", 20 | "embeds": [{ 21 | "title": "Test Embed", 22 | "description": "This is a test message to verify the webhook configuration.", 23 | "color": 5814783, # A nice blue color 24 | "fields": [ 25 | { 26 | "name": "Status", 27 | "value": "✅ Connected", 28 | "inline": True 29 | } 30 | ] 31 | }] 32 | } 33 | 34 | try: 35 | response = requests.post( 36 | webhook_url, 37 | data=json.dumps(message), 38 | headers={'Content-Type': 'application/json'}, 39 | timeout=10 # 10 seconds timeout 40 | ) 41 | response.raise_for_status() 42 | print("Test message sent successfully!") 43 | return True 44 | except requests.exceptions.RequestException as e: 45 | print(f"Error sending test message: {e}") 46 | return False 47 | 48 | if __name__ == "__main__": 49 | test_discord_webhook() 50 | -------------------------------------------------------------------------------- /.github/workflows/paper-tracker.yml: -------------------------------------------------------------------------------- 1 | name: Paper Tracker 2 | 3 | on: 4 | schedule: 5 | - cron: '0 */12 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | track-papers: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: examples/firecrawl_automated_whitepaper_tracking 14 | 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.10.15' 23 | cache: 'pip' 24 | 25 | - name: Install Poetry 26 | run: | 27 | curl -sSL https://install.python-poetry.org | python3 - 28 | poetry --version 29 | 30 | - name: Install dependencies 31 | run: | 32 | poetry install 33 | 34 | - name: Test Database Connection (Optional) 35 | env: 36 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Must be port 6543 now 37 | run: | 38 | echo "Testing database connection with psql..." 
39 | sudo apt-get update && sudo apt-get install -y postgresql-client 40 | psql "$POSTGRES_URL" -c '\conninfo' 41 | 42 | - name: Run paper tracker 43 | env: 44 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 45 | FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} 46 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Also port 6543 here 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | run: | 49 | poetry run python hf_white_paper_tracker.py 50 | 51 | - name: Notify on Failure 52 | if: failure() 53 | env: 54 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 55 | run: | 56 | curl -H "Content-Type: application/json" -X POST \ 57 | -d '{"content":"⚠️ Paper Tracker workflow failed! Please check the GitHub Actions logs."}' \ 58 | $DISCORD_WEBHOOK_URL 59 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/paper-tracker.yml: -------------------------------------------------------------------------------- 1 | name: Paper Tracker 2 | 3 | on: 4 | schedule: 5 | - cron: '0 */12 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | track-papers: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: examples/firecrawl_automated_whitepaper_tracking 14 | 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.10' 23 | cache: 'pip' 24 | 25 | - name: Install Poetry 26 | run: | 27 | curl -sSL https://install.python-poetry.org | python3 - 28 | poetry --version 29 | 30 | - name: Install dependencies 31 | run: | 32 | poetry install 33 | 34 | - name: Test Database Connection (Optional) 35 | env: 36 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Must be port 6543 now 37 | run: | 38 | echo "Testing database connection with psql..." 39 | sudo apt-get update && sudo apt-get install -y postgresql-client 40 | psql "$POSTGRES_URL" -c '\conninfo' 41 | 42 | - name: Run paper tracker 43 | env: 44 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 45 | FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} 46 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Also port 6543 here 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | run: | 49 | poetry run python hf_white_paper_tracker.py 50 | 51 | - name: Notify on Failure 52 | if: failure() 53 | env: 54 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 55 | run: | 56 | curl -H "Content-Type: application/json" -X POST \ 57 | -d '{"content":"⚠️ Paper Tracker workflow failed! 
Please check the GitHub Actions logs."}' \ 58 | $DISCORD_WEBHOOK_URL 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 3 | 4 | # User-specific stuff 5 | .idea/**/workspace.xml 6 | .idea/**/tasks.xml 7 | .idea/**/usage.statistics.xml 8 | .idea/**/dictionaries 9 | .idea/**/shelf 10 | 11 | # AWS User-specific 12 | .idea/**/aws.xml 13 | 14 | # Generated files 15 | .idea/**/contentModel.xml 16 | 17 | # Sensitive or high-churn files 18 | .idea/**/dataSources/ 19 | .idea/**/dataSources.ids 20 | .idea/**/dataSources.local.xml 21 | .idea/**/sqlDataSources.xml 22 | .idea/**/dynamic.xml 23 | .idea/**/uiDesigner.xml 24 | .idea/**/dbnavigator.xml 25 | 26 | # Gradle 27 | .idea/**/gradle.xml 28 | .idea/**/libraries 29 | 30 | # Gradle and Maven with auto-import 31 | # When using Gradle or Maven with auto-import, you should exclude module files, 32 | # since they will be recreated, and may cause churn. Uncomment if using 33 | # auto-import. 34 | # .idea/artifacts 35 | # .idea/compiler.xml 36 | # .idea/jarRepositories.xml 37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # SonarLint plugin 65 | .idea/sonarlint/ 66 | 67 | # Crashlytics plugin (for Android Studio and IntelliJ) 68 | com_crashlytics_export_strings.xml 69 | crashlytics.properties 70 | crashlytics-build.properties 71 | fabric.properties 72 | 73 | # Editor-based Rest Client 74 | .idea/httpRequests 75 | 76 | # Android studio 3.1+ serialized cache file 77 | .idea/caches/build_file_checksums.ser 78 | 79 | # Files for environment variables, excluding examples 80 | *.env 81 | !.env.example 82 | 83 | # macOS system files 84 | .DS_Store 85 | .DS_Store? 86 | ._* 87 | 88 | # PyCharm 89 | .idea/ 90 | *.iml 91 | *.iws 92 | *.ipr 93 | *.xml 94 | .idea_modules/ 95 | *.pyc 96 | __pycache__/ 97 | *.pyo 98 | *.pyd 99 | .Python 100 | .python-version 101 | 102 | # VSCode 103 | .vscode/* 104 | !.vscode/extensions.json 105 | !.vscode/launch.json 106 | !.vscode/tasks.json 107 | *.code-workspace 108 | .history/ 109 | .settings/ 110 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/category_prompt.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This file defines the DESIRED_CATEGORY as a single, long string 3 | that describes the category of interest. 4 | """ 5 | 6 | DESIRED_CATEGORY = """ 7 | **Definition of “AI Agents”** 8 | An "AI Agent" is any system in which a large language model (LLM): 9 | 1. Maintains Dynamic Control over how tasks are accomplished, including 10 | which tools or APIs are used and in what sequence. 11 | 2. Plans, Reasons, and Adapts its approach based on user goals and 12 | feedback from its environment (e.g., tool outputs, code execution, 13 | external data). 14 | 3. 
Acts Autonomously or Semi-Autonomously in open-ended or complex tasks 15 | that cannot be fully decomposed in advance. 16 | 4. Demonstrates Decision-Making beyond hardcoded or strictly 17 | human-defined workflow paths, such as deciding what to do next at 18 | each step (versus executing a single, fixed script). 19 | 20 | **Core Criterion for Classification** 21 | A white paper belongs to the “AI Agents” category if its primary focus 22 | describes, evaluates, measures, or demonstrates LLM-based systems that 23 | exhibit or aim to exhibit one or more of the above qualities. This includes 24 | systems that: 25 | • Show Partial, Incremental, or Full Autonomy in real-world or 26 | simulated tasks. 27 | • Employ LLMs to Dynamically Decide how to use tools (e.g., web 28 | browsing, code writing, system commands). 29 | • Investigate, Benchmark, or Compare the performance of such agentic 30 | systems, even if only a subset of tasks is completed autonomously. 31 | • Provide Frameworks for Building or Testing agentic capabilities in 32 | LLMs (e.g., multi-step planning, chain-of-thought reasoning, 33 | environment/tool usage). 34 | 35 | **Clarifications to Prevent Underclassification** 36 | • Partial Autonomy Counts: Papers need not demonstrate 100% autonomous 37 | task completion. Even if an LLM handles only a fraction of tasks 38 | without human intervention, it can still qualify if the system’s 39 | goal or design involves adaptive or autonomous capabilities. 40 | • Research or Benchmarking is Included: Papers that focus on measuring, 41 | experimenting with, or benchmarking LLM agents should be classified 42 | as “AI Agents” if they revolve around agentic behavior, even if the 43 | research finds current systems are limited or only partially 44 | successful. 45 | • Use of Tools or Environment: If the paper describes LLMs selecting 46 | and executing code, commands, or API calls at their own discretion 47 | (i.e., not merely a single-step prompt for code generation), it 48 | likely falls under agentic systems. 49 | • Evaluation of Agent Performance: Studies that assess the 50 | effectiveness, reliability, or scalability of AI agents in performing 51 | tasks should be included if they address the agent’s ability to 52 | autonomously manage and execute tasks. 53 | • Integration with External Systems: Papers that explore how AI agents 54 | interact with external systems, databases, or APIs to accomplish 55 | tasks should be considered relevant. 56 | 57 | **Exclusion Criterion** 58 | A paper should not be classified under “AI Agents” if it only: 59 | • Discusses Static or Single-Step LLM Prompts that generate answers, 60 | translations, or content without autonomy or iterative 61 | decision-making. 62 | • Describes Purely Human-Orchestrated Pipelines where the LLM’s role is 63 | strictly predefined at each step (no dynamic path-finding, tool 64 | selection, or open-ended planning). 65 | • Focuses on General LLM Usage (e.g., chatbots, Q&A systems) without 66 | discussing autonomy, adaptive behavior, or iterative tool usage. 
67 | 68 | **Likely Categories for Agentic Systems Papers** 69 | Based on Anthropic’s blog post, these arXiv categories are the most likely 70 | homes for papers on agentic LLM systems: 71 | • Multiagent Systems (cs.MA) – Most directly relevant 72 | • Artificial Intelligence (cs.AI) 73 | • Computation and Language (cs.CL) 74 | • Machine Learning (cs.LG) 75 | • Human-Computer Interaction (cs.HC) 76 | • Software Engineering (cs.SE) 77 | """ -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/logging_config.py: -------------------------------------------------------------------------------- 1 | """Module for configuring logging across the application.""" 2 | 3 | import logging 4 | from logging.handlers import RotatingFileHandler 5 | from datetime import datetime 6 | from functools import wraps # Needed for the decorator 7 | from pathlib import Path 8 | 9 | def setup_base_logging( 10 | logger_name: str, 11 | log_file: str = None, 12 | log_level: int = logging.INFO, 13 | format_string: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 14 | ) -> logging.Logger: 15 | """Configure base logging with both file and console handlers. 16 | 17 | Args: 18 | logger_name (str): Name of the logger to configure 19 | log_file (str, optional): Path to log file. If None, only console logging is used 20 | log_level (int, optional): Logging level. Defaults to INFO 21 | format_string (str, optional): Format string for log messages 22 | 23 | Returns: 24 | logging.Logger: Configured logger instance 25 | """ 26 | # Create logs directory relative to the examples directory 27 | logs_dir = Path(__file__).parent.parent / 'logs' 28 | logs_dir.mkdir(parents=True, exist_ok=True) 29 | 30 | logger = logging.getLogger(logger_name) 31 | logger.setLevel(log_level) 32 | 33 | # Clear any existing handlers 34 | if logger.hasHandlers(): 35 | logger.handlers.clear() 36 | 37 | # Create formatter 38 | formatter = logging.Formatter(format_string) 39 | 40 | # Create and configure console handler 41 | console_handler = logging.StreamHandler() 42 | console_handler.setLevel(log_level) 43 | console_handler.setFormatter(formatter) 44 | logger.addHandler(console_handler) 45 | 46 | # Add file handler if log_file is specified 47 | if log_file: 48 | log_path = logs_dir / log_file 49 | file_handler = RotatingFileHandler( 50 | log_path, 51 | maxBytes=5*1024*1024, # 5MB 52 | backupCount=5 53 | ) 54 | file_handler.setLevel(log_level) 55 | file_handler.setFormatter(formatter) 56 | logger.addHandler(file_handler) 57 | 58 | return logger 59 | 60 | def setup_crawler_logging() -> logging.Logger: 61 | """ 62 | Configure logging for the crawler with file output in the specified logs directory. 
63 | 64 | Returns: 65 | logging.Logger: Configured logger instance 66 | """ 67 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 68 | return setup_base_logging( 69 | logger_name="hf_paper_tracker", 70 | log_file=f"paper_tracker_{timestamp}.log" 71 | ) 72 | 73 | def setup_semantic_filter_logging() -> logging.Logger: 74 | """Configure logging specifically for the semantic filter module.""" 75 | timestamp = datetime.now().strftime("%Y%m%d") 76 | return setup_base_logging( 77 | logger_name='semantic_filter', 78 | log_file=f'semantic_filter_{timestamp}.log', 79 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 80 | ) 81 | 82 | def setup_database_logging() -> logging.Logger: 83 | """Configure logging specifically for the database module.""" 84 | return setup_base_logging( 85 | logger_name='database', 86 | log_file='database.log' 87 | ) 88 | 89 | def log_function_call(func): 90 | """Decorator to log entry and exit of functions.""" 91 | @wraps(func) 92 | def wrapper(*args, **kwargs): 93 | logger = logging.getLogger('semantic_filter') # or whichever logger name you prefer 94 | logger.info("Entering %s", func.__name__) 95 | try: 96 | result = func(*args, **kwargs) 97 | logger.info("Exiting %s successfully", func.__name__) 98 | return result 99 | except Exception as e: 100 | logger.error("Error in %s: %s", func.__name__, str(e)) 101 | raise 102 | return wrapper 103 | 104 | # TODO: change logging style to display logging level before module name 105 | # e.g. 2024-12-24 12:05:18,885 - INFO - hf_paper_tracker - Extracting paper details from 106 | # instead of 2024-12-24 12:05:18,885 - hf_paper_tracker - INFO - Extracting paper details from 107 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/discord_notifications.py: -------------------------------------------------------------------------------- 1 | """Module for sending notifications about new research papers to Discord.""" 2 | 3 | import os 4 | import asyncio 5 | import aiohttp 6 | from dotenv import load_dotenv 7 | from logging_config import setup_base_logging, log_function_call 8 | 9 | # Configure logging using the centralized configuration 10 | logger = setup_base_logging( 11 | logger_name="discord_notifier", 12 | log_file="discord_notifications.log", 13 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 14 | ) 15 | 16 | load_dotenv() 17 | 18 | @log_function_call 19 | async def send_paper_notification( 20 | paper_title: str, 21 | authors: list, 22 | abstract: str, 23 | upvotes: int, 24 | comments: int, 25 | url: str, 26 | pdf_url: str = None, 27 | arxiv_url: str = None, 28 | github_url: str = None 29 | ): 30 | """Send a new paper notification to Discord""" 31 | logger.info(f"Preparing notification for paper: {paper_title}") 32 | 33 | # Create links section 34 | links = [] 35 | if pdf_url: 36 | links.append(f"[📄 PDF]({pdf_url})") 37 | logger.debug("Added PDF link to notification") 38 | if arxiv_url: 39 | links.append(f"[📝 arXiv]({arxiv_url})") 40 | logger.debug("Added arXiv link to notification") 41 | if github_url: 42 | links.append(f"[💻 GitHub]({github_url})") 43 | logger.debug("Added GitHub link to notification") 44 | 45 | # Truncate abstract if needed 46 | truncated_abstract = abstract[:500] + ('...' 
if len(abstract) > 500 else '') 47 | logger.debug(f"Abstract truncated from {len(abstract)} to {len(truncated_abstract)} chars") 48 | 49 | message = { 50 | "embeds": [ 51 | { 52 | "title": "📚 New Paper Published!", 53 | "description": f"**{paper_title}**\n\n" 54 | f"**Authors:** {', '.join(authors)}\n\n" 55 | f"**Abstract:**\n{truncated_abstract}\n\n" 56 | f"**Stats:** ⬆️ {upvotes} | 💬 {comments}\n\n" 57 | f"**Links:**\n{' • '.join(links)}\n\n" 58 | f"[View on HuggingFace]({url})", 59 | "color": 5814783, # HF's purple color 60 | } 61 | ] 62 | } 63 | 64 | webhook_url = os.getenv("DISCORD_WEBHOOK_URL") 65 | if not webhook_url: 66 | logger.error("Discord webhook URL not found in environment variables") 67 | return 68 | 69 | try: 70 | logger.info("Sending notification to Discord webhook") 71 | async with aiohttp.ClientSession() as session: 72 | async with session.post(webhook_url, json=message) as response: 73 | if response.status == 204: # Discord returns 204 on success 74 | logger.info("Successfully sent Discord notification") 75 | else: 76 | response_text = await response.text() 77 | logger.error(f"Discord API returned status {response.status}: {response_text}") 78 | 79 | except aiohttp.ClientError as e: 80 | logger.error(f"Network error sending Discord notification: {str(e)}", exc_info=True) 81 | except Exception as e: 82 | logger.error(f"Unexpected error sending Discord notification: {str(e)}", exc_info=True) 83 | 84 | if __name__ == "__main__": 85 | logger.info("Starting Discord notification test") 86 | try: 87 | # Test notification 88 | asyncio.run( 89 | send_paper_notification( 90 | paper_title="Test Paper Title", 91 | authors=["Author 1", "Author 2"], 92 | abstract="This is a test abstract for the paper notification system.", 93 | upvotes=10, 94 | comments=5, 95 | url="https://huggingface.co/papers/test", 96 | pdf_url="https://example.com/test.pdf", 97 | arxiv_url="https://arxiv.org/abs/test", 98 | github_url="https://github.com/test/repo" 99 | ) 100 | ) 101 | logger.info("Test notification completed") 102 | except Exception as e: 103 | logger.error("Test notification failed:", exc_info=True) 104 | 105 | # TODO: implement discord button for feedback about relevancy of notifications, 106 | # this must be fed back into the database for subsequent refinement of prompts 107 | 108 | # TODO: implement admin-only error notifications in Discord - errors should only be 109 | # visible to channel administrators to avoid cluttering the main feed 110 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/hf_white_paper_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point module for the Hugging Face White Paper Tracker. 4 | Handles command line arguments and initiates the paper tracking process. 
5 | """ 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import asyncio 11 | import requests 12 | from typing import Optional 13 | from sqlalchemy.exc import SQLAlchemyError 14 | 15 | # Add project root to Python path 16 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 17 | sys.path.insert(0, project_root) 18 | 19 | # Now we can import our modules 20 | from examples.firecrawl_automated_whitepaper_tracking.firecrawl_crawl_extract import ( 21 | extract_paper_urls, 22 | process_paper_batch, 23 | get_todays_papers_url 24 | ) 25 | from examples.firecrawl_automated_whitepaper_tracking.supabase_db import Database 26 | from examples.firecrawl_automated_whitepaper_tracking.logging_config import setup_crawler_logging 27 | 28 | # Initialize logger 29 | logger = setup_crawler_logging() 30 | 31 | def verify_database_connection(db: Database) -> tuple[bool, str]: 32 | """Test database connection and return status.""" 33 | logger.debug("Verifying database connection...") 34 | try: 35 | db.get_all_papers() 36 | return True, "Database connection successful" 37 | except SQLAlchemyError as e: 38 | logger.error("Database connection failed: %s", str(e)) 39 | return False, f"Database connection failed: {str(e)}" 40 | 41 | def verify_database_version(db: Database) -> tuple[bool, str]: 42 | """Verify database schema version matches required version.""" 43 | logger.debug("Verifying database schema version...") 44 | try: 45 | session = db.session_factory() 46 | db._check_schema_version() 47 | session.close() 48 | return True, f"Database schema version verified" 49 | except RuntimeError as e: 50 | logger.error("Database version check failed: %s", str(e)) 51 | return False, str(e) 52 | 53 | def perform_startup_checks(db: Database) -> None: 54 | """Perform all startup checks before proceeding.""" 55 | # Database connection check 56 | connection_ok, connection_msg = verify_database_connection(db) 57 | logger.info(connection_msg) 58 | if not connection_ok: 59 | raise RuntimeError(connection_msg) 60 | 61 | # Database version check 62 | version_ok, version_msg = verify_database_version(db) 63 | logger.info(version_msg) 64 | if not version_ok: 65 | raise RuntimeError(version_msg) 66 | 67 | def run_paper_tracker(url: Optional[str] = None, date: Optional[str] = None) -> None: 68 | """ 69 | Main function to run the paper tracking process. 
70 | 71 | Args: 72 | url (Optional[str]): Full URL to crawl (e.g., https://huggingface.co/papers?date=2024-12-19) 73 | date (Optional[str]): Date in YYYY-MM-DD format (e.g., 2024-12-19) 74 | """ 75 | # Initialize database first 76 | db = Database(os.getenv("POSTGRES_URL")) 77 | logger.info("Database connection initialized") 78 | 79 | # Perform startup checks before proceeding 80 | perform_startup_checks(db) 81 | 82 | # Determine which URL to use 83 | if url: 84 | papers_url = url 85 | logger.info("Using provided full URL: %s", papers_url) 86 | elif date: 87 | papers_url = f"https://huggingface.co/papers?date={date}" 88 | logger.info("Using URL for specified date: %s", papers_url) 89 | else: 90 | papers_url = get_todays_papers_url() 91 | logger.info("Using today's papers URL: %s", papers_url) 92 | 93 | urls = extract_paper_urls(papers_url) 94 | logger.info("Found %d papers to process", len(urls)) 95 | 96 | try: 97 | asyncio.run(process_paper_batch(urls, db)) 98 | except (SQLAlchemyError, requests.RequestException, ValueError) as e: 99 | logger.error("Critical error in main process: %s", str(e), exc_info=True) 100 | raise 101 | 102 | if __name__ == "__main__": 103 | # Set up argument parser 104 | parser = argparse.ArgumentParser(description='Crawl and extract papers from HuggingFace.') 105 | parser.add_argument('--url', type=str, 106 | help='Full URL to crawl (e.g., https://huggingface.co/papers?date=2024-12-19)') 107 | parser.add_argument('--date', type=str, 108 | help='Date in YYYY-MM-DD format (e.g., 2024-12-19)') 109 | 110 | args = parser.parse_args() 111 | run_paper_tracker(url=args.url, date=args.date) 112 | 113 | # TODO: Include a Bluesky API call to publish the paper's posts to Bluesky. This will require a new 114 | # llm flow to generate the post content and a new function to send the post to Bluesky. 115 | # TODO: test db connection and add check for db versoin matching supabase_db.py before running any modules -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/semantic_filter.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for semantic filtering of research papers using OpenAI's API.""" 2 | 3 | import os 4 | import json 5 | 6 | from json import JSONDecodeError 7 | from pydantic import BaseModel, ValidationError 8 | from dotenv import load_dotenv 9 | 10 | import openai 11 | from logging_config import setup_semantic_filter_logging, log_function_call 12 | from category_prompt import DESIRED_CATEGORY 13 | 14 | # Load environment variables 15 | load_dotenv() 16 | 17 | # Configure OpenAI API key 18 | openai.api_key = os.getenv('OPENAI_API_KEY') 19 | 20 | # Configure logging using centralized configuration 21 | logger = setup_semantic_filter_logging() 22 | logger.info("Using OpenAI version: %s", openai.__version__) 23 | 24 | client = openai.OpenAI() 25 | 26 | class CategoryMatch(BaseModel): 27 | """ 28 | Pydantic model for paper category classification results. 29 | We validate the structured output from the model here, ensuring 30 | it has a boolean 'belongs_to_category' and a float 'confidence'. 31 | """ 32 | belongs_to_category: bool 33 | confidence: float 34 | 35 | @log_function_call 36 | def should_process(paper_details: dict, is_new_paper: bool) -> tuple[bool, float]: 37 | """ 38 | Determine if a paper should be processed (notifications, posts, etc.) 
39 | 40 | Args: 41 | paper_details (dict): Dictionary containing paper details 42 | is_new_paper (bool): Whether this is a new paper or an update 43 | 44 | Returns: 45 | tuple[bool, float]: (should_process, confidence) 46 | - should_process: True if paper should be processed 47 | - confidence: Model's confidence score 48 | """ 49 | if not is_new_paper: 50 | return False, 0.0 51 | 52 | # Get semantic classification 53 | belongs, confidence = belongs_to_category( 54 | paper_details["paper_title"], 55 | paper_details["abstract_body"], 56 | DESIRED_CATEGORY 57 | ) 58 | 59 | # Only process if both belongs is True AND confidence is high enough 60 | return belongs and confidence > 0.8, confidence 61 | 62 | @log_function_call 63 | def belongs_to_category(paper_title: str, paper_abstract: str, desired_category: str) -> tuple[bool, float]: 64 | """ 65 | Determine if a paper belongs to a specific category using 66 | an OpenAI model that supports structured JSON outputs. 67 | 68 | Returns: 69 | tuple: (belongs_to_category: bool, confidence: float) 70 | """ 71 | logger.info("Analyzing paper: '%s' for category '%s'", paper_title, desired_category) 72 | 73 | system_instructions = ( 74 | "You are a research paper classifier. " 75 | "Given: a desired_category, a paper_title, and a paper_abstract, " 76 | "determine if the paper belongs to the desired_category. " 77 | "Output only valid JSON with the exact format: " 78 | "{ \"belongs_to_category\": boolean, \"confidence\": float }. " 79 | "Where 'belongs_to_category' is True if the paper belongs to the specified desired_category, " 80 | "otherwise False, and 'confidence' is a float between 0 and 1. No additional keys or text." 81 | ) 82 | 83 | user_prompt = ( 84 | f"desired_category: {desired_category}\n" 85 | f"paper_title: {paper_title}\n" 86 | f"paper_abstract: {paper_abstract}" 87 | ) 88 | 89 | try: 90 | response = client.chat.completions.create( 91 | model="gpt-4o-mini", 92 | messages=[ 93 | {"role": "system", "content": system_instructions}, 94 | {"role": "user", "content": user_prompt}, 95 | ], 96 | temperature=0.7 97 | ) 98 | # Add detailed response logging 99 | logger.debug("Full API response: %s", response) 100 | 101 | message_content = response.choices[0].message.content.strip() 102 | logger.debug("Raw message content: %s", message_content) 103 | 104 | if not message_content: 105 | logger.error("Empty response from model") 106 | return False, 0.0 107 | 108 | parsed_args = json.loads(message_content) 109 | 110 | classification = CategoryMatch(**parsed_args) 111 | logger.info( 112 | "Classification result: belongs=%s, confidence=%s", 113 | classification.belongs_to_category, 114 | classification.confidence 115 | ) 116 | return classification.belongs_to_category, classification.confidence 117 | 118 | except (JSONDecodeError, ValidationError) as e: 119 | logger.error("Error parsing or validating classification result: %s", e) 120 | return False, 0.0 121 | 122 | if __name__ == "__main__": 123 | logger.info("Starting semantic filter test") 124 | 125 | # Test the classifier 126 | TEST_TITLE = "Building Reliable LLM Agents: A Study in Reinforcement Learning" 127 | TEST_ABSTRACT = ( 128 | "This paper explores methods for creating more reliable AI agents using LLMs and RL..." 
129 | ) 130 | CATEGORY = "LLM Agents" 131 | result = belongs_to_category(TEST_TITLE, TEST_ABSTRACT, CATEGORY) 132 | logger.info("Test result for category '%s': %s", CATEGORY, result) 133 | 134 | # TODO: implement the Instructor library for structured outputs to enhance 135 | # the flexibility of model switching 136 | # TODO: add examples to system prompt of abstract that are known to be 137 | # in the category (agents) 138 | # TODO: store paper category relevance evaluations and confidence scores in the database 139 | # to develop more accurate relevance response evaluations in the future 140 | # using the OpenAI Evals platform. 141 | # TODO: implement error handling for OpenAI API credit exhaustion and send admin-only 142 | # notifications to Discord using discord_notifications.py's webhook. Research needed: 143 | # Discord webhook might not support role-based visibility (@admin mentions) directly - 144 | # may need to create a separate admin-only channel or explore Discord bot implementation 145 | # TODO: swap out gpt-4o for gpt-4o-mini in the semantic filter to save on cost. 146 | # ensure to get the correct model name from the API docs 147 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post_v3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | import hashlib 4 | import secrets 5 | import webbrowser 6 | from http.server import HTTPServer, BaseHTTPRequestHandler 7 | from urllib.parse import parse_qs, urlparse 8 | import requests 9 | import logging 10 | import re 11 | from dotenv import load_dotenv 12 | from requests_oauthlib import OAuth2Session 13 | import time 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.DEBUG, 18 | format='%(asctime)s - %(levelname)s - %(message)s' 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | # Load environment variables 23 | load_dotenv() 24 | 25 | # API endpoints - Updated to correct domains 26 | MEDIA_ENDPOINT_URL = 'https://api.x.com/2/media/upload' 27 | POST_TO_X_URL = 'https://api.x.com/2/tweets' 28 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 29 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 30 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 31 | 32 | class CallbackHandler(BaseHTTPRequestHandler): 33 | """Handle OAuth callback""" 34 | code = None 35 | 36 | def do_GET(self): 37 | """Process callback GET request""" 38 | query = parse_qs(urlparse(self.path).query) 39 | CallbackHandler.code = query.get('code', [None])[0] 40 | 41 | self.send_response(200) 42 | self.send_header('Content-type', 'text/html') 43 | self.end_headers() 44 | self.wfile.write(b"Authorization successful! 
You can close this window.") 45 | 46 | class XPost: 47 | def __init__(self): 48 | client_id = os.getenv('X_OAUTH2_CLIENT_ID') 49 | client_secret = os.getenv('X_OAUTH2_CLIENT_SECRET') 50 | self.access_token = os.getenv('X_OAUTH2_ACCESS_TOKEN') 51 | refresh_token = os.getenv('X_OAUTH2_REFRESH_TOKEN') 52 | 53 | # Initialize OAuth2 session 54 | self.oauth = OAuth2Session( 55 | client_id, 56 | token={ 57 | 'access_token': self.access_token, 58 | 'refresh_token': refresh_token, 59 | 'token_type': 'bearer' 60 | } 61 | ) 62 | 63 | self.headers = { 64 | "Authorization": f"Bearer {self.access_token}", 65 | "Content-Type": "application/json", 66 | "User-Agent": "XPostBot" 67 | } 68 | 69 | def upload_image(self, image_path): 70 | """Upload an image using the v2 endpoint with multi-step process""" 71 | logger.info(f"Attempting to upload image: {image_path}") 72 | 73 | if not os.path.exists(image_path): 74 | raise FileNotFoundError(f"Image file not found: {image_path}") 75 | 76 | total_bytes = os.path.getsize(image_path) 77 | 78 | # Step 1: INIT 79 | init_data = { 80 | 'command': 'INIT', 81 | 'media_type': 'image/png', 82 | 'total_bytes': total_bytes, 83 | 'media_category': 'tweet_image' 84 | } 85 | 86 | response = self.oauth.post(MEDIA_ENDPOINT_URL, params=init_data, headers=self.headers) 87 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 88 | logger.error(f"Media INIT failed: {response.text}") 89 | raise Exception(f"Failed to initialize media upload: {response.text}") 90 | 91 | media_id = response.json()['data']['id'] 92 | 93 | # Step 2: APPEND 94 | with open(image_path, 'rb') as file: 95 | chunk = file.read() 96 | files = {'media': ('chunk', chunk, 'application/octet-stream')} 97 | data = { 98 | 'command': 'APPEND', 99 | 'media_id': media_id, 100 | 'segment_index': 0 101 | } 102 | 103 | response = self.oauth.post(MEDIA_ENDPOINT_URL, data=data, files=files) 104 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 105 | logger.error(f"Media APPEND failed: {response.text}") 106 | raise Exception(f"Failed to append media: {response.text}") 107 | 108 | # Step 3: FINALIZE 109 | finalize_data = { 110 | 'command': 'FINALIZE', 111 | 'media_id': media_id 112 | } 113 | 114 | response = self.oauth.post(MEDIA_ENDPOINT_URL, params=finalize_data, headers=self.headers) 115 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 116 | logger.error(f"Media FINALIZE failed: {response.text}") 117 | raise Exception(f"Failed to finalize media: {response.text}") 118 | 119 | logger.info(f"Successfully uploaded media with ID: {media_id}") 120 | return media_id 121 | 122 | def create_tweet(self, text, media_id): 123 | """Create a tweet with text and attached media""" 124 | logger.info("Attempting to create tweet") 125 | 126 | payload = { 127 | 'text': text, 128 | 'media': { 129 | 'media_ids': [media_id] 130 | } 131 | } 132 | 133 | response = self.oauth.post(POST_TO_X_URL, json=payload, headers=self.headers) 134 | 135 | logger.debug(f"Response status code: {response.status_code}") 136 | logger.debug(f"Response headers: {response.headers}") 137 | logger.debug(f"Response body: {response.text}") 138 | 139 | if response.status_code == 429: 140 | reset_time = int(response.headers.get('x-app-limit-24hour-reset', 0)) 141 | wait_seconds = max(reset_time - int(time.time()), 0) 142 | logger.warning(f"Rate limit exceeded. 
Waiting {wait_seconds} seconds...") 143 | time.sleep(wait_seconds) 144 | # Retry the request 145 | return self.create_tweet(text, media_id) 146 | elif response.status_code != 201: 147 | logger.error(f"Tweet creation failed with status {response.status_code}") 148 | logger.error(f"Response: {response.text}") 149 | raise Exception(f"Failed to create tweet: {response.text}") 150 | 151 | return response.json() 152 | 153 | def main(): 154 | try: 155 | poster = XPost() 156 | 157 | # Use absolute path or correct relative path to your image 158 | image_path = "test.png" # Update this to your image path 159 | logger.info(f"Starting process with image: {image_path}") 160 | 161 | # Upload image 162 | media_id = poster.upload_image(image_path) 163 | 164 | # Create tweet with image 165 | tweet_text = "Testing X API v2 with an image attachment! 🚀" 166 | result = poster.create_tweet(tweet_text, media_id) 167 | 168 | logger.info("Tweet posted successfully!") 169 | logger.info(f"Result: {result}") 170 | 171 | except Exception as e: 172 | logger.error(f"Error: {str(e)}", exc_info=True) 173 | raise 174 | 175 | if __name__ == "__main__": 176 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Firecrawl Quickstarts Logo 5 | 6 | 7 | 8 | Firecrawl Quickstarts is an independent and unofficial collection of projects designed to help developers quickly get started with building applications using the Firecrawl API. Each quickstart provides a foundation that you can easily build upon and customize for your specific needs. This repository is not affiliated with, endorsed by, or officially supported by Firecrawl. 9 | 10 | ## Getting Started 11 | 12 | To use these quickstarts, you'll need a Firecrawl API key. If you don't have one yet, you can sign up for free at [firecrawl.dev](https://firecrawl.dev). 13 | 14 | ## Available Quickstarts 15 | 16 | ### Eventbrite AI Event Scout 17 | 18 | Automated discovery of AI/ML events across major cities 19 | 20 | - Scrapes events from 50+ global cities daily 21 | - Confidence scoring for relevance filtering 22 | - NLP filtering for relevant content 23 | - Automated Discord alerts with event details 24 | 25 | [Go to Eventbrite AI Event Scout](./events-scout-examples/eventbrite.ipynb) 26 | 27 | ### Luma AI Event Discovery 28 | 29 | Automated tracking of tech meetups across 60+ global cities 30 | 31 | - Scrapes and structures event data (titles/dates/locations) 32 | - Confidence scoring for relevance filtering 33 | - NLP filtering for relevant events 34 | - Automated Discord alerts with event details 35 | 36 | [Go to Luma AI Event Discovery](./events-scout-examples/luma.ipynb) 37 | 38 | ### Firecrawl Web Crawling with OpenAI and Anthropic 39 | 40 | This quickstart introduces how to integrate Firecrawl with OpenAI and Anthropic models to search and extract information based on specific user objectives. Learn to map a website, identify relevant pages, and retrieve content aligned with the objective. Ideal for targeted information gathering. 41 | 42 | [Go to Firecrawl Web Crawling with OpenAI and Anthropic](./claude_researcher_with_map.ipynb) 43 | 44 | ### Integrating OpenAI o1 Models with Firecrawl 45 | 46 | Explore how to enhance the Firecrawl web crawling process with OpenAI’s o1 reasoning models.
This quickstart guides you in using these advanced models to generate search parameters, map sites, and validate extracted content, enhancing the precision and relevance of data extraction. 47 | 48 | [Go to Integrating OpenAI o1 Models with Firecrawl](./crawl_and_extract_with_openai_o1.ipynb) 49 | 50 | ### Building a Web Crawler with Grok-2 and Firecrawl 51 | 52 | Combine Grok-2’s AI-powered understanding with Firecrawl’s search to create an intelligent web crawler. This quickstart demonstrates building a targeted crawler that finds and processes structured data on web pages, with output in JSON format for seamless data handling. 53 | 54 | [Go to Building a Web Crawler with Grok-2 and Firecrawl](./crawl_and_extract_with_xai_grok.ipynb) 55 | 56 | ### Firecrawl Map Endpoint Quickstart 57 | 58 | Learn how to use Firecrawl's Map endpoint to create comprehensive sitemaps from single URLs. This quickstart is perfect for efficiently gathering website structures, enabling tasks such as content mapping, SEO analysis, and scalable web data extraction. 59 | 60 | [Go to Firecrawl Map Endpoint Quickstart](./firecrawl_map_endpoint_tutorial.ipynb) 61 | 62 | ### Job Board Scraping with Firecrawl and OpenAI 63 | 64 | Automate job listing extraction and analysis with Firecrawl and OpenAI’s Structured Outputs. This quickstart demonstrates scraping job boards, extracting structured job details, and matching listings to a user’s resume with schema-compliant outputs for reliable data processing. 65 | 66 | [Go to Job Board Scraping with Firecrawl and OpenAI](./job_scraping_tutorial.ipynb) 67 | 68 | ### Firecrawl LLM Extract Tutorial 69 | 70 | Learn how to use Firecrawl’s LLM-powered data extraction features. This quickstart covers extracting structured data from web pages, with options for schema-defined and prompt-only extraction, making it adaptable for diverse data formats and applications. 71 | 72 | [Go to Firecrawl LLM Extract Tutorial](./llm_extract_tutorial.ipynb) 73 | 74 | ## General Usage 75 | 76 | Each quickstart project is a Jupyter notebook designed to be easily opened and run on Google Colab. To get started, follow these steps: 77 | 78 | 1. **Open the Repository in Google Colab** 79 | 80 | Each notebook has a link to open directly in Google Colab. Click on the link for the quickstart you want to explore. 81 | 82 | 2. **Set Up Your Firecrawl API Key** 83 | 84 | Each notebook requires a Firecrawl API key. Once you've created your key (available [here](https://firecrawl.dev)), enter it in the notebook when prompted or set it as an environment variable as directed in the notebook. 85 | 86 | 3. **Run Each Notebook Cell Sequentially** 87 | 88 | Follow the instructions within each notebook, running cells in order. The notebooks will guide you through each step, from setting up the environment to executing web scraping or extraction tasks. 89 | 90 | 4. **View Results and Experiment** 91 | 92 | The notebooks are designed to be interactive. You can modify the code cells, adjust parameters, or try different objectives to explore Firecrawl’s capabilities further. 93 | 94 | Each notebook includes explanations and usage examples to help you understand and customize your setup. 
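If you prefer to experiment outside a notebook, the snippet below sketches the minimal setup every quickstart assumes: the API key read from the `FIRECRAWL_API_KEY` environment variable and a first scrape and map request. It is an illustrative, unofficial sketch that assumes the `firecrawl-py` v1 SDK (the version pinned in the whitepaper-tracking example's `pyproject.toml`); the parameters each notebook actually passes may differ.

```python
# Illustrative sketch only: assumes the firecrawl-py v1 SDK and that
# FIRECRAWL_API_KEY is already set in your environment (e.g. via a .env file).
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# Scrape a single page; most notebooks start from markdown output like this.
scrape_result = app.scrape_url("https://firecrawl.dev", params={"formats": ["markdown"]})
print(scrape_result)

# Map a site to enumerate its URLs before choosing which pages to scrape in depth.
map_result = app.map_url("https://firecrawl.dev")
print(map_result)
```

If both calls return data, your key is working and any of the notebooks above can be run with the same credentials.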
95 | 96 | ## Explore Further 97 | 98 | To deepen your understanding of working with Firecrawl and its API, check out these resources: 99 | 100 | - [**Firecrawl Documentation**](https://docs.firecrawl.dev) - Comprehensive guides and API references 101 | - [**Firecrawl SDKs**](https://docs.firecrawl.dev/sdks/overview) - Explore the official SDKs for [Python](https://docs.firecrawl.dev/sdks/python), [Node.js](https://docs.firecrawl.dev/sdks/node), [Go](https://docs.firecrawl.dev/sdks/go), and [Rust](https://docs.firecrawl.dev/sdks/rust) 102 | - [**LLM Framework Integrations**](https://docs.firecrawl.dev/integrations/overview) - Learn how to use Firecrawl with frameworks like LangChain and Llama Index 103 | - [**Firecrawl API Reference**](https://docs.firecrawl.dev/api-reference/introduction) - Detailed API endpoints and parameters 104 | 105 | ## Contributing 106 | 107 | We welcome contributions to the Firecrawl Quickstarts repository! If you have ideas for new quickstart projects or improvements to existing ones, please open an issue or submit a pull request. 108 | 109 | ## Community and Support 110 | 111 | - Join the [Firecrawl Discord community](https://discord.com/invite/gSmWdAkdwd) for discussions and support 112 | - Follow Firecrawl on [Twitter](https://twitter.com/firecrawl_dev) and [LinkedIn](https://www.linkedin.com/company/104100957) for updates 113 | - Check out the [Firecrawl Support Documentation](https://docs.firecrawl.dev) for additional help 114 | 115 | ## License 116 | 117 | This project is licensed under the [MIT](https://opensource.org/licenses/MIT) License - see the [LICENSE](LICENSE) file for details. 118 | 119 | --- 120 | 121 | *It is the sole responsibility of the end users to respect websites' policies when scraping, searching, and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling.
By utilizing Firecrawl, you expressly agree to comply with these conditions.* 123 | 124 | [↑ Back to Top ↑](#firecrawl-quickstarts) 125 | 126 | Copyright (c) 2024-present, Alex Fazio 127 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post.py: -------------------------------------------------------------------------------- 1 | """Module for posting research papers to X (Twitter).""" 2 | 3 | import os 4 | import base64 5 | import hashlib 6 | import secrets 7 | import webbrowser 8 | from http.server import HTTPServer, BaseHTTPRequestHandler 9 | from urllib.parse import parse_qs, urlparse 10 | from dotenv import load_dotenv, set_key, find_dotenv 11 | import requests 12 | from typing import Optional 13 | from logging_config import setup_base_logging, log_function_call 14 | 15 | # Configure logging using the centralized configuration 16 | logger = setup_base_logging( 17 | logger_name="x_poster", 18 | log_file="x_poster.log", 19 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 20 | ) 21 | 22 | load_dotenv() 23 | 24 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 25 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 26 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 27 | X_API_URL = "https://api.twitter.com/2/tweets" 28 | 29 | @log_function_call 30 | def generate_pkce_pair(): 31 | """Generate PKCE code verifier and challenge""" 32 | code_verifier = secrets.token_urlsafe(64) 33 | code_challenge = base64.urlsafe_b64encode( 34 | hashlib.sha256(code_verifier.encode()).digest() 35 | ).rstrip(b'=').decode() 36 | 37 | logger.debug(f"Generated PKCE - Verifier: {len(code_verifier)} chars, Challenge: {len(code_challenge)} chars") 38 | return code_verifier, code_challenge 39 | 40 | class CallbackHandler(BaseHTTPRequestHandler): 41 | """Handle OAuth callback""" 42 | code = None 43 | 44 | def log_message(self, format, *args): 45 | """Override to use our logger""" 46 | logger.info(f"OAuth Callback: {format%args}") 47 | 48 | def do_GET(self): 49 | """Process callback GET request""" 50 | logger.info(f"Received callback request: {self.path}") 51 | 52 | query = parse_qs(urlparse(self.path).query) 53 | CallbackHandler.code = query.get('code', [None])[0] 54 | 55 | if CallbackHandler.code: 56 | logger.info("Successfully received authorization code") 57 | else: 58 | logger.error(f"No authorization code in callback. Query params: {query}") 59 | 60 | # Log any error parameters 61 | if 'error' in query: 62 | logger.error(f"Error in callback: {query['error']}") 63 | if 'error_description' in query: 64 | logger.error(f"Error description: {query['error_description']}") 65 | 66 | self.send_response(200) 67 | self.send_header('Content-type', 'text/html') 68 | self.end_headers() 69 | self.wfile.write(b"Authorization successful! 
You can close this window.") 70 | 71 | @log_function_call 72 | def load_stored_tokens(): 73 | """Load stored OAuth tokens from .env""" 74 | dotenv_path = find_dotenv() 75 | load_dotenv(dotenv_path) 76 | 77 | access_token = os.getenv('X_ACCESS_TOKEN') 78 | refresh_token = os.getenv('X_REFRESH_TOKEN') 79 | 80 | if access_token and refresh_token: 81 | logger.debug("Successfully loaded stored tokens") 82 | return { 83 | 'access_token': access_token, 84 | 'refresh_token': refresh_token, 85 | 'expires_in': os.getenv('X_TOKEN_EXPIRES_IN'), 86 | 'scope': os.getenv('X_TOKEN_SCOPE') 87 | } 88 | logger.warning("No stored tokens found") 89 | return None 90 | 91 | @log_function_call 92 | def save_tokens(tokens): 93 | """Save OAuth tokens to .env""" 94 | dotenv_path = find_dotenv() 95 | 96 | try: 97 | set_key(dotenv_path, 'X_ACCESS_TOKEN', tokens['access_token']) 98 | set_key(dotenv_path, 'X_REFRESH_TOKEN', tokens['refresh_token']) 99 | set_key(dotenv_path, 'X_TOKEN_EXPIRES_IN', str(tokens['expires_in'])) 100 | set_key(dotenv_path, 'X_TOKEN_SCOPE', tokens['scope']) 101 | load_dotenv(dotenv_path) 102 | logger.info("Successfully saved tokens to .env") 103 | except Exception as e: 104 | logger.error(f"Failed to save tokens: {str(e)}") 105 | raise 106 | 107 | def refresh_access_token(refresh_token): 108 | """Get new access token using refresh token""" 109 | auth = ( 110 | os.getenv('X_OAUTH2_CLIENT_ID'), 111 | os.getenv('X_OAUTH2_CLIENT_SECRET') 112 | ) 113 | 114 | data = { 115 | 'refresh_token': refresh_token, 116 | 'grant_type': 'refresh_token' 117 | } 118 | 119 | response = requests.post(TOKEN_URL, auth=auth, data=data) 120 | return response.json() 121 | 122 | def get_oauth2_token(): 123 | """Get OAuth 2.0 token, using stored tokens if available""" 124 | # Try to load stored tokens 125 | tokens = load_stored_tokens() 126 | 127 | if tokens: 128 | logger.debug(f"Found stored tokens with keys: {list(tokens.keys())}") 129 | if 'refresh_token' in tokens: 130 | try: 131 | logger.info("Attempting to refresh token...") 132 | new_tokens = refresh_access_token(tokens['refresh_token']) 133 | logger.debug(f"Refresh response keys: {list(new_tokens.keys())}") 134 | save_tokens(new_tokens) 135 | return new_tokens['access_token'] 136 | except Exception as e: 137 | logger.error(f"Token refresh failed: {e}") 138 | else: 139 | logger.error("Stored tokens missing refresh_token") 140 | 141 | # If no stored tokens or refresh failed, do full authorization 142 | logger.info("Starting OAuth 2.0 PKCE flow") 143 | code_verifier, code_challenge = generate_pkce_pair() 144 | 145 | auth_params = { 146 | 'response_type': 'code', 147 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 148 | 'redirect_uri': CALLBACK_URL, 149 | 'scope': 'tweet.write tweet.read users.read offline.access', 150 | 'code_challenge': code_challenge, 151 | 'code_challenge_method': 'S256', 152 | 'state': secrets.token_urlsafe(32) 153 | } 154 | 155 | logger.info("Starting local callback server") 156 | server = HTTPServer(('127.0.0.1', 8000), CallbackHandler) 157 | 158 | auth_url = f"{AUTH_URL}?{'&'.join(f'{k}={v}' for k,v in auth_params.items())}" 159 | logger.info(f"Opening authorization URL: {auth_url}") 160 | webbrowser.open(auth_url) 161 | 162 | logger.info("Waiting for callback...") 163 | server.handle_request() 164 | 165 | if not CallbackHandler.code: 166 | raise Exception("Failed to get authorization code") 167 | 168 | logger.info("Exchanging code for token") 169 | token_data = { 170 | 'code': CallbackHandler.code, 171 | 'grant_type': 'authorization_code', 172 
| 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 173 | 'redirect_uri': CALLBACK_URL, 174 | 'code_verifier': code_verifier 175 | } 176 | 177 | auth = ( 178 | os.getenv('X_OAUTH2_CLIENT_ID'), 179 | os.getenv('X_OAUTH2_CLIENT_SECRET') 180 | ) 181 | 182 | response = requests.post(TOKEN_URL, auth=auth, data=token_data) 183 | if response.status_code != 200: 184 | raise Exception(f"Token exchange failed: {response.text}") 185 | 186 | token_json = response.json() 187 | logger.debug(f"Received token response with keys: {list(token_json.keys())}") 188 | save_tokens(token_json) 189 | return token_json['access_token'] 190 | 191 | def format_post( 192 | paper_title: str, 193 | authors: list, 194 | url: str, 195 | pdf_url: Optional[str] = None, 196 | arxiv_url: Optional[str] = None, 197 | github_url: Optional[str] = None 198 | ) -> str: 199 | """Format paper details into X post text""" 200 | # Start with title and truncate if needed 201 | post = f"📚 {paper_title[:100]}{'...' if len(paper_title) > 100 else ''}\n\n" 202 | 203 | # Add authors (limited to first 2 if many) 204 | if len(authors) > 2: 205 | authors_text = f"by {', '.join(authors[:2])} et al." 206 | else: 207 | authors_text = f"by {', '.join(authors)}" 208 | post += f"{authors_text}\n\n" 209 | 210 | # Add links 211 | post += f"🔗 {url}" 212 | if pdf_url: 213 | post += f"\n📄 {pdf_url}" 214 | if arxiv_url: 215 | post += f"\n📝 {arxiv_url}" 216 | if github_url: 217 | post += f"\n💻 {github_url}" 218 | 219 | return post 220 | 221 | def post_paper( 222 | paper_title: str, 223 | authors: list, 224 | url: str, 225 | pdf_url: Optional[str] = None, 226 | arxiv_url: Optional[str] = None, 227 | github_url: Optional[str] = None 228 | ): 229 | """Post paper to X""" 230 | logger.info(f"Attempting to post paper: {paper_title}") 231 | 232 | # Get OAuth 2.0 token 233 | try: 234 | token = get_oauth2_token() 235 | logger.info("Successfully obtained OAuth token") 236 | except Exception as e: 237 | logger.error(f"Failed to get OAuth token: {str(e)}") 238 | return None 239 | 240 | post_text = format_post( 241 | paper_title, authors, url, 242 | pdf_url, arxiv_url, github_url 243 | ) 244 | 245 | headers = { 246 | 'Authorization': f'Bearer {token}', 247 | 'Content-Type': 'application/json' 248 | } 249 | 250 | payload = {"text": post_text} 251 | 252 | try: 253 | logger.info("Sending post request to X API") 254 | response = requests.post(X_API_URL, json=payload, headers=headers) 255 | 256 | logger.debug(f"Post response status: {response.status_code}") 257 | logger.debug(f"Post response headers: {response.headers}") 258 | 259 | if response.status_code != 201: 260 | logger.error(f"Error posting to X: {response.text}") 261 | else: 262 | logger.info("Successfully posted to X") 263 | 264 | return response.json() 265 | except Exception as e: 266 | logger.error(f"Error posting to X: {str(e)}", exc_info=True) 267 | return None 268 | 269 | if __name__ == "__main__": 270 | logger.info("Starting X post test") 271 | try: 272 | response = post_paper( 273 | paper_title="Test Paper Title", 274 | authors=["Author 1", "Author 2"], 275 | url="https://huggingface.co/papers/test", 276 | pdf_url="https://example.com/test.pdf", 277 | arxiv_url="https://arxiv.org/abs/test", 278 | github_url="https://github.com/test/repo" 279 | ) 280 | 281 | if response and 'data' in response: 282 | logger.info("✅ Post successful!") 283 | logger.info(f"Tweet ID: {response['data']['id']}") 284 | logger.info(f"Tweet text: {response['data']['text']}") 285 | else: 286 | logger.error("❌ Post failed!") 287 | 
logger.error(f"Response: {response}") 288 | 289 | except Exception as e: 290 | logger.error("❌ Error during posting:") 291 | logger.error(f"Error details: {str(e)}", exc_info=True) -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/tests/test_semantic_filter.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for testing the semantic filter.""" 2 | 3 | import os 4 | import sys 5 | 6 | # Add the project root to the Python path 7 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) 8 | sys.path.insert(0, project_root) 9 | 10 | from examples.firecrawl_automated_whitepaper_tracking.semantic_filter import belongs_to_category 11 | from examples.firecrawl_automated_whitepaper_tracking.category_prompt import DESIRED_CATEGORY 12 | 13 | def test_belongs_to_category(): 14 | """ 15 | This test ensures that belongs_to_category returns a boolean indicating if the paper 16 | likely belongs to the specified category, based on the model's classification. 17 | Also prints confidence scores for analysis. 18 | """ 19 | 20 | # Our test cases with inputs and expected outputs 21 | test_cases = [ 22 | ( 23 | "PaliGemma 2: A Family of Versatile VLMs for Transfer", 24 | """PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) based on the Gemma 2 family of language models. We combine the SigLIP-So400m vision encoder that was also used by PaliGemma with the whole range of Gemma 2 models, from the 2B one all the way up to the 27B model. We train these models at three resolutions (224px, 448px, and 896px) in multiple stages to equip them with broad knowledge for transfer via fine-tuning. The resulting family of base models covering different model sizes and resolutions allows us to investigate factors impacting transfer performance (such as learning rate) and to analyze the interplay between the type of task, model size, and resolution. We further increase the number and breadth of transfer tasks beyond the scope of PaliGemma including different OCR-related tasks such as table structure recognition, molecular structure recognition, music score recognition, as well as long fine-grained captioning and radiography report generation, on which PaliGemma 2 obtains state-of-the-art results.""", 25 | DESIRED_CATEGORY, 26 | False 27 | ), 28 | ( 29 | "From Generation to Judgment: Opportunities and Challenges of LLM-as-a-judge", 30 | """Assessment and evaluation have long been critical challenges in artificial intelligence (AI) and natural language processing (NLP). However, traditional methods, whether matching-based or embedding-based, often fall short of judging subtle attributes and delivering satisfactory results. Recent advancements in Large Language Models (LLMs) inspire the "LLM-as-a-judge" paradigm, where LLMs are leveraged to perform scoring, ranking, or selection across various tasks and applications. This paper provides a comprehensive survey of LLM-based judgment and assessment, offering an in-depth overview to advance this emerging field. We begin by giving detailed definitions from both input and output perspectives. Then we introduce a comprehensive taxonomy to explore LLM-as-a-judge from three dimensions: what to judge, how to judge and where to judge. 
Finally, we compile benchmarks for evaluating LLM-as-a-judge and highlight key challenges and promising directions, aiming to provide valuable insights and inspire future research in this promising research area. Paper list and more resources about LLM-as-a-judge can be found at https://github.com/llm-as-a-judge/Awesome-LLM-as-a-judge and https://llm-as-a-judge.github.io.""", 31 | DESIRED_CATEGORY, 32 | False 33 | ), 34 | ( 35 | "Evaluation Agent: Efficient and Promptable Evaluation Framework for Visual Generative Models", 36 | """Recent advancements in visual generative models have enabled high-quality image and video generation, opening diverse applications. However, evaluating these models often demands sampling hundreds or thousands of images or videos, making the process computationally expensive, especially for diffusion-based models with inherently slow sampling. Moreover, existing evaluation methods rely on rigid pipelines that overlook specific user needs and provide numerical results without clear explanations. In contrast, humans can quickly form impressions of a model's capabilities by observing only a few samples. To mimic this, we propose the Evaluation Agent framework, which employs human-like strategies for efficient, dynamic, multi-round evaluations using only a few samples per round, while offering detailed, user-tailored analyses. It offers four key advantages: 1) efficiency, 2) promptable evaluation tailored to diverse user needs, 3) explainability beyond single numerical scores, and 4) scalability across various models and tools. Experiments show that Evaluation Agent reduces evaluation time to 10% of traditional methods while delivering comparable results. The Evaluation Agent framework is fully open-sourced to advance research in visual generative models and their efficient evaluation.""", 37 | DESIRED_CATEGORY, 38 | True 39 | ), 40 | ( 41 | "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks", 42 | """We interact with computers on an everyday basis, be it in everyday life or work, and many aspects of work can be done entirely with access to a computer and the Internet. At the same time, thanks to improvements in large language models (LLMs), there has also been a rapid development in AI agents that interact with and affect change in their surrounding environments. But how performant are AI agents at helping to accelerate or even autonomously perform work-related tasks? The answer to this question has important implications for both industry looking to adopt AI into their workflows, and for economic policy to understand the effects that adoption of AI may have on the labor market. To measure the progress of these LLM agents' performance on performing real-world professional tasks, in this paper, we introduce TheAgentCompany, an extensible benchmark for evaluating AI agents that interact with the world in similar ways to those of a digital worker: by browsing the Web, writing code, running programs, and communicating with other coworkers. We build a self-contained environment with internal web sites and data that mimics a small software company environment, and create a variety of tasks that may be performed by workers in such a company. We test baseline agents powered by both closed API-based and open-weights language models (LMs), and find that with the most competitive agent, 24% of the tasks can be completed autonomously. 
This paints a nuanced picture on task automation with LM agents -- in a setting simulating a real workplace, a good portion of simpler tasks could be solved autonomously, but more difficult long-horizon tasks are still beyond the reach of current systems.""", 43 | DESIRED_CATEGORY, 44 | True 45 | ), 46 | ( 47 | "GUI Agents: A Survey", 48 | """Graphical User Interface (GUI) agents, powered by Large Foundation Models, have emerged as a transformative approach to automating human-computer interaction. These agents autonomously interact with digital systems or software applications via GUIs, emulating human actions such as clicking, typing, and navigating visual elements across diverse platforms. Motivated by the growing interest and fundamental importance of GUI agents, we provide a comprehensive survey that categorizes their benchmarks, evaluation metrics, architectures, and training methods. We propose a unified framework that delineates their perception, reasoning, planning, and acting capabilities. Furthermore, we identify important open challenges and discuss key future directions. Finally, this work serves as a basis for practitioners and researchers to gain an intuitive understanding of current progress, techniques, benchmarks, and critical open problems that remain to be addressed.""", 49 | DESIRED_CATEGORY, 50 | True 51 | ), 52 | ( 53 | "Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction", 54 | """Graphical User Interfaces (GUIs) are critical to human-computer interaction, yet automating GUI tasks remains challenging due to the complexity and variability of visual environments. Existing approaches often rely on textual representations of GUIs, which introduce limitations in generalization, efficiency, and scalability. In this paper, we introduce Aguvis, a unified pure vision-based framework for autonomous GUI agents that operates across various platforms. Our approach leverages image-based observations, and grounding instructions in natural language to visual elements, and employs a consistent action space to ensure cross-platform generalization. To address the limitations of previous work, we integrate explicit planning and reasoning within the model, enhancing its ability to autonomously navigate and interact with complex digital environments. We construct a large-scale dataset of GUI agent trajectories, incorporating multimodal reasoning and grounding, and employ a two-stage training pipeline that first focuses on general GUI grounding, followed by planning and reasoning. Through comprehensive experiments, we demonstrate that Aguvis surpasses previous state-of-the-art methods in both offline and real-world online scenarios, achieving, to our knowledge, the first fully autonomous pure vision GUI agent capable of performing tasks independently without collaboration with external closed-source models. 
We open-sourced all datasets, models, and training recipes to facilitate future research at https://aguvis-project.github.io/.""", 55 | DESIRED_CATEGORY, 56 | True 57 | ), 58 | ] 59 | 60 | # Track failed tests 61 | failed_tests = [] 62 | 63 | # Run each test case 64 | for paper_title, paper_abstract, desired_category, expected_boolean in test_cases: 65 | try: 66 | # Get the raw response from the model 67 | result, confidence = belongs_to_category(paper_title, paper_abstract, desired_category) 68 | 69 | # Print the confidence score and classification result 70 | print(f"\nPaper: {paper_title[:50]}...") 71 | print(f"Expected category match: {expected_boolean}") 72 | print(f"Actual category match: {result}") 73 | print(f"Confidence score: {confidence:.2f}") 74 | 75 | assert isinstance(result, bool), "The result should be a boolean." 76 | if result != expected_boolean: 77 | failed_tests.append({ 78 | 'title': paper_title[:50], 79 | 'expected': expected_boolean, 80 | 'got': result 81 | }) 82 | 83 | except Exception as e: 84 | failed_tests.append({ 85 | 'title': paper_title[:50], 86 | 'error': str(e) 87 | }) 88 | 89 | # Print summary at the end 90 | print("\n=== Test Summary ===") 91 | if not failed_tests: 92 | print("All tests passed successfully!") 93 | else: 94 | print(f"Failed tests ({len(failed_tests)}):") 95 | for test in failed_tests: 96 | if 'error' in test: 97 | print(f"- {test['title']}: {test['error']}") 98 | else: 99 | print(f"- {test['title']}: expected {test['expected']}, got {test['got']}") 100 | raise AssertionError("Some tests failed. See summary above.") 101 | 102 | if __name__ == "__main__": 103 | test_belongs_to_category() 104 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/firecrawl_crawl_extract.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for crawling and extracting data from Hugging Face papers using Firecrawl. 2 | 3 | Handles paper metadata extraction and processing for the notification system. 4 | """ 5 | 6 | import asyncio 7 | import os 8 | import re 9 | from datetime import datetime 10 | from typing import Dict, Any 11 | 12 | # Third-party imports 13 | import pytz 14 | import requests 15 | from sqlalchemy.exc import SQLAlchemyError 16 | from pydantic import BaseModel 17 | from firecrawl import FirecrawlApp 18 | from dotenv import load_dotenv 19 | from supabase_db import Database 20 | from semantic_filter import should_process 21 | from discord_notifications import send_paper_notification 22 | from x_post import post_paper 23 | from logging_config import setup_crawler_logging 24 | 25 | # Initialize logger 26 | logger = setup_crawler_logging() 27 | 28 | # Load environment variables 29 | load_dotenv() 30 | 31 | # Validate required environment variables 32 | if not os.getenv("POSTGRES_URL"): 33 | raise ValueError("POSTGRES_URL environment variable not set") 34 | if not os.getenv("FIRECRAWL_API_KEY"): 35 | raise ValueError("FIRECRAWL_API_KEY environment variable not set") 36 | 37 | def extract_paper_urls(target_url: str) -> list: 38 | """ 39 | Extract all paper source URLs from a given target URL using Firecrawl.
40 | 41 | Args: 42 | target_url (str): The URL to crawl for paper sources 43 | 44 | Returns: 45 | list: A list of extracted source URLs, excluding daily papers URLs 46 | """ 47 | logger.info("Starting URL extraction from: %s", target_url) 48 | exclude_url_pattern = ( 49 | r"^https://huggingface\.co/papers\?date=" 50 | r"\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$" 51 | ) 52 | def get_all_source_urls(json_data: Dict[str, Any]) -> list: 53 | extracted_urls = [] 54 | logger.debug("Processing JSON data with %d entries", len(json_data.get('data', []))) 55 | if "data" in json_data: 56 | for entry in json_data["data"]: 57 | if "metadata" in entry and "sourceURL" in entry["metadata"]: 58 | url = entry["metadata"]["sourceURL"] 59 | if not re.match(exclude_url_pattern, url): 60 | extracted_urls.append(url) 61 | if "next" in json_data and json_data["next"]: 62 | logger.debug("Found next page: %s", json_data['next']) 63 | next_page_url = json_data["next"] 64 | response = requests.get(next_page_url) # noqa 65 | if response.ok: 66 | next_json_data = response.json() 67 | extracted_urls.extend(get_all_source_urls(next_json_data)) 68 | return extracted_urls 69 | 70 | load_dotenv() 71 | api_key = os.getenv("FIRECRAWL_API_KEY") 72 | app = FirecrawlApp(api_key=api_key) 73 | params = { 74 | 'limit': 30, 75 | 'excludePaths': ['papers$'], 76 | 'includePaths': ['papers/*'], 77 | 'ignoreSitemap': True, 78 | 'scrapeOptions': { 79 | 'formats': ['markdown', 'links', 'html'], 80 | 'onlyMainContent': False, 81 | 'includeTags': ['a'] 82 | } 83 | } 84 | logger.info("Crawling URL with params: %s", params) 85 | crawl_result = app.crawl_url(target_url, params=params) 86 | urls = get_all_source_urls(crawl_result) 87 | logger.info("Extracted %d paper URLs", len(urls)) 88 | return urls 89 | 90 | async def extract_paper_details(url: str) -> dict: 91 | """Extract paper details from a given URL using FirecrawlApp. 92 | 93 | This async function handles the extraction of metadata from individual paper pages. 94 | It uses the FirecrawlApp to scrape structured data according to the ExtractSchema. 95 | The synchronous FirecrawlApp calls are run in a separate thread using asyncio.to_thread. 96 | 97 | Args: 98 | url (str): The URL of the paper to extract details from. 99 | 100 | Returns: 101 | dict: Extracted paper details including title, upvotes, comments, and URLs. 
102 | """ 103 | logger.info("Extracting paper details from: %s", url) 104 | # Initialize the FirecrawlApp with your API key 105 | app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) 106 | 107 | class ExtractSchema(BaseModel): # noqa 108 | """Schema for extracting paper details from Hugging Face papers.""" 109 | paper_title: str 110 | number_of_upvotes: int 111 | number_of_comments: int 112 | view_pdf_url: str 113 | view_arxiv_page_url: str 114 | authors: str 115 | abstract_body: str 116 | # Publication date represents when the paper was originally published (e.g., on arXiv) 117 | utc_publication_date_day: int 118 | utc_publication_date_month: int 119 | utc_publication_date_year: int 120 | # Submission date represents when the paper was submitted/added to 121 | # HuggingFace's daily papers page 122 | utc_submission_date_day: int 123 | utc_submission_date_month: int 124 | utc_submission_date_year: int 125 | github_repo_url: str 126 | 127 | # Run the synchronous scrape_url method in a separate thread 128 | data = await asyncio.to_thread( 129 | app.scrape_url, 130 | url, 131 | { 132 | 'formats': ['extract'], 133 | 'extract': { 134 | 'schema': ExtractSchema.model_json_schema(), 135 | } 136 | } 137 | ) 138 | logger.debug("Raw extraction data: %s", data['extract']) 139 | return data['extract'] 140 | 141 | async def process_paper_batch(urls: list[str], db: Database, batch_size: int = 5): 142 | """Process papers in batches to avoid overwhelming resources""" 143 | for i in range(0, len(urls), batch_size): 144 | batch = urls[i:i + batch_size] 145 | tasks = [] 146 | for url in batch: 147 | tasks.append(extract_paper_details(url)) 148 | 149 | details_list = await asyncio.gather(*tasks, return_exceptions=True) 150 | 151 | for url, details in zip(batch, details_list): 152 | current_time = datetime.now() 153 | paper_data = { 154 | "url": url, 155 | "extraction_success": True, 156 | "extraction_error": None, 157 | "last_extraction_attempt": current_time, 158 | "notification_sent": False 159 | } 160 | 161 | if isinstance(details, Exception): 162 | logger.error(f"Error processing {url}: {details}") 163 | paper_data.update({ 164 | "extraction_success": False, 165 | "extraction_error": str(details) 166 | }) 167 | try: 168 | db.add_paper(paper_data) 169 | except SQLAlchemyError as e: 170 | logger.error(f"Database error storing failed paper {url}: {e}") 171 | continue 172 | 173 | try: 174 | paper_data.update(details) 175 | is_new_paper = db.add_paper(paper_data) 176 | 177 | # Use should_process from semantic_filter 178 | should_process_paper, confidence = should_process(details, is_new_paper) 179 | 180 | if should_process_paper: 181 | # Send Discord notification 182 | notification_success = await send_paper_notification( 183 | paper_title=details["paper_title"], 184 | authors=details["authors"].split(", "), 185 | abstract=details["abstract_body"], 186 | upvotes=details["number_of_upvotes"], 187 | comments=details["number_of_comments"], 188 | url=url, 189 | pdf_url=details["view_pdf_url"], 190 | arxiv_url=details["view_arxiv_page_url"], 191 | github_url=details["github_repo_url"] 192 | ) 193 | 194 | if notification_success: 195 | try: 196 | db.update_notification_status(url, True) 197 | except SQLAlchemyError as e: 198 | logger.error(f"Failed to update notification status for {url}: {e}") 199 | 200 | # Post to X 201 | try: 202 | x_response = post_paper( 203 | paper_title=details["paper_title"], 204 | authors=details["authors"].split(", "), 205 | url=url, 206 | pdf_url=details["view_pdf_url"], 207 | 
arxiv_url=details["view_arxiv_page_url"], 208 | github_url=details["github_repo_url"] 209 | ) 210 | if x_response and 'data' in x_response: 211 | logger.info(f"Successfully posted paper to X: {url}") 212 | # TODO: Update x_post_sent status in DB once column is added 213 | else: 214 | logger.error(f"Failed to post paper to X: {url}") 215 | except Exception as e: 216 | logger.error(f"Error posting to X for {url}: {e}") 217 | 218 | except Exception as e: 219 | logger.error(f"Error processing details for {url}: {e}") 220 | paper_data.update({ 221 | "extraction_success": False, 222 | "extraction_error": str(e) 223 | }) 224 | try: 225 | db.add_paper(paper_data) 226 | except SQLAlchemyError as db_error: 227 | logger.error(f"Database error storing error state for {url}: {db_error}") 228 | 229 | def get_todays_papers_url() -> str: 230 | """ 231 | Returns today's HuggingFace papers URL using San Francisco timezone. 232 | 233 | Returns: 234 | str: URL for today's papers webpage in format https://huggingface.co/papers?date=YYYY-MM-DD 235 | """ 236 | sf_tz = pytz.timezone('America/Los_Angeles') 237 | today = datetime.now(sf_tz).strftime('%Y-%m-%d') 238 | return f"https://huggingface.co/papers?date={today}" 239 | 240 | # TODO: create a streamlit ui to set environment variables and desired categories for the semantic filter 241 | # TODO: make the extract_paper_details function async so details are extracted in parallel 242 | # TODO: make all functions async to avoid redudant code 243 | # TODO: for each url extracted by the crawler it should be verified if it exists already in 244 | # the database before passing it to the extract_paper_details function 245 | # TODO: update the extract_paper_details function to process new papers only if the number of 246 | # found papers for the specific date is greater than the number of papers already found for that date 247 | # within the database. so this will need a new specific database table to store the number of papers 248 | # found for each date. 249 | # TODO: implement an improve error handling system for the extract_paper_details function which will 250 | # add papers that failed to be processed to the database but include a column to indicate that the 251 | # paper was not processed successfully, so that a retry can be performed when cron jobs are re-run. 252 | # this will require changing the database structure and schema, as well as some of the existing logic 253 | # that verifies if a paper should be processed or not. 254 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/supabase_db.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for interacting with the supabase database using SQLAlchemy.""" 2 | 3 | from datetime import datetime, timedelta 4 | from sqlalchemy import ( 5 | create_engine, Column, String, Integer, DateTime, Text, ARRAY, text, Boolean 6 | ) 7 | from sqlalchemy.orm import sessionmaker, declarative_base 8 | from logging_config import setup_database_logging 9 | from sqlalchemy.exc import SQLAlchemyError 10 | 11 | # Configure logging using centralized configuration 12 | logger = setup_database_logging() 13 | 14 | Base = declarative_base() 15 | 16 | class Paper(Base): 17 | """SQLAlchemy model for storing research papers from the Hugging Face daily papers page. 
18 | Each paper entry includes its URL, title, authors, abstract, associated URLs (PDF, arXiv, GitHub), 19 | publication and submission dates, as well as current engagement metrics (upvotes and comments).""" 20 | __tablename__ = "papers" 21 | url = Column(String, primary_key=True) 22 | title = Column(String, nullable=False) 23 | authors = Column(ARRAY(String), nullable=False) 24 | abstract = Column(Text, nullable=False) 25 | pdf_url = Column(String) 26 | arxiv_url = Column(String) 27 | github_url = Column(String) 28 | publication_date = Column(DateTime, nullable=False) 29 | submission_date = Column(DateTime, nullable=False) 30 | upvotes = Column(Integer, default=0) 31 | comments = Column(Integer, default=0) 32 | last_updated = Column(DateTime, default=datetime.now, onupdate=datetime.now) 33 | notification_sent = Column(Boolean, default=False) 34 | extraction_success = Column(Boolean, default=True) 35 | extraction_error = Column(Text, nullable=True) 36 | last_extraction_attempt = Column(DateTime, default=datetime.now) 37 | 38 | 39 | class Database: 40 | """Class for interacting with the database using SQLAlchemy.""" 41 | CURRENT_SCHEMA_VERSION = 2 42 | 43 | def __init__(self, connection_string, skip_version_check=False): 44 | logger.info("Initializing Database connection") 45 | if not connection_string: 46 | logger.error("Database connection string is not set") 47 | raise ValueError("Database connection string is not set") 48 | 49 | # Ensure sslmode=require is appended 50 | if '?' not in connection_string: 51 | connection_string += '?sslmode=require' 52 | elif 'sslmode' not in connection_string: 53 | connection_string += '&sslmode=require' 54 | 55 | logger.debug("Creating engine with connection string: %s", connection_string.split('?')[0]) 56 | self.engine = create_engine( 57 | connection_string, 58 | pool_pre_ping=True # Pre-ping helps keep connections alive 59 | ) 60 | 61 | logger.info("Creating database tables if they don't exist") 62 | Base.metadata.create_all(self.engine) 63 | self.session_factory = sessionmaker(bind=self.engine) 64 | logger.info("Database initialization complete") 65 | 66 | # Only check version if not skipped 67 | if not skip_version_check: 68 | self._check_schema_version() 69 | 70 | def _check_schema_version(self): 71 | """Verify database schema version is compatible.""" 72 | session = self.session_factory() 73 | try: 74 | # Create version table if it doesn't exist 75 | session.execute(text(""" 76 | CREATE TABLE IF NOT EXISTS schema_version ( 77 | version INTEGER PRIMARY KEY, 78 | applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 79 | ) 80 | """)) 81 | 82 | # Get current version 83 | result = session.execute(text( 84 | "SELECT version FROM schema_version ORDER BY version DESC LIMIT 1" 85 | )) 86 | db_version = result.scalar() or 0 87 | 88 | if db_version == 0: 89 | # New database, set initial version 90 | session.execute(text( 91 | "INSERT INTO schema_version (version) VALUES (:version)" 92 | ), {"version": self.CURRENT_SCHEMA_VERSION}) 93 | session.commit() 94 | elif db_version < self.CURRENT_SCHEMA_VERSION: 95 | logger.error( 96 | "Database schema version %d is older than required version %d. " 97 | "Please run migrations.", 98 | db_version, self.CURRENT_SCHEMA_VERSION 99 | ) 100 | raise RuntimeError("Database schema needs migration") 101 | elif db_version > self.CURRENT_SCHEMA_VERSION: 102 | logger.error( 103 | "Database schema version %d is newer than supported version %d. 
" 104 | "Please update the application.", 105 | db_version, self.CURRENT_SCHEMA_VERSION 106 | ) 107 | raise RuntimeError("Database schema version not supported") 108 | 109 | logger.info("Database schema version: %d", db_version) 110 | 111 | except Exception as e: 112 | logger.error("Error checking schema version: %s", str(e)) 113 | raise 114 | finally: 115 | session.close() 116 | 117 | def get_all_papers(self): 118 | """Get all papers from the database""" 119 | logger.info("Fetching all papers from database") 120 | session = self.session_factory() 121 | try: 122 | papers = session.query(Paper).all() 123 | logger.info("Retrieved %d papers from database", len(papers)) 124 | return papers 125 | except Exception as e: 126 | logger.error("Error fetching papers: %s", str(e)) 127 | raise 128 | finally: 129 | session.close() 130 | 131 | def add_paper(self, paper_data): 132 | """Add or update a paper and its current metrics. 133 | 134 | Returns: 135 | bool: True if this is a new paper, False if it's an update 136 | """ 137 | logger.info("Adding/updating paper: %s", paper_data['url']) 138 | session = self.session_factory() 139 | try: 140 | # Check if paper already exists 141 | existing_paper = session.query(Paper).filter( 142 | Paper.url == paper_data["url"] 143 | ).first() 144 | 145 | is_new_paper = existing_paper is None 146 | action = "Adding new" if is_new_paper else "Updating existing" 147 | logger.info("%s paper: %s", action, paper_data['url']) 148 | 149 | if paper_data.get("extraction_success"): 150 | # Date handling with logging 151 | try: 152 | publication_date = datetime( 153 | paper_data["utc_publication_date_year"], 154 | paper_data["utc_publication_date_month"], 155 | paper_data["utc_publication_date_day"] 156 | ) 157 | except ValueError as e: 158 | logger.warning("Invalid publication date in %s: %s", paper_data['url'], e) 159 | publication_date = datetime.now() 160 | 161 | try: 162 | submission_date = datetime( 163 | paper_data["utc_submission_date_year"], 164 | paper_data["utc_submission_date_month"], 165 | paper_data["utc_submission_date_day"] 166 | ) 167 | except ValueError as e: 168 | logger.warning("Invalid submission date in %s: %s", paper_data['url'], e) 169 | submission_date = datetime.now() 170 | 171 | # Create/update the paper entry with metrics 172 | paper = Paper( 173 | url=paper_data["url"], 174 | title=paper_data["paper_title"], 175 | authors=paper_data["authors"].split(", "), 176 | abstract=paper_data["abstract_body"], 177 | pdf_url=paper_data.get("view_pdf_url"), 178 | arxiv_url=paper_data.get("view_arxiv_page_url"), 179 | github_url=paper_data.get("github_repo_url"), 180 | publication_date=publication_date, 181 | submission_date=submission_date, 182 | upvotes=paper_data.get("number_of_upvotes", 0), 183 | comments=paper_data.get("number_of_comments", 0) 184 | ) 185 | else: 186 | # Handle failed extraction 187 | paper = Paper( 188 | url=paper_data["url"], 189 | extraction_success=False, 190 | extraction_error=paper_data.get("extraction_error"), 191 | last_extraction_attempt=paper_data["last_extraction_attempt"], 192 | notification_sent=paper_data.get("notification_sent", False) 193 | ) 194 | 195 | logger.debug("Merging paper data for %s", paper_data['url']) 196 | session.merge(paper) 197 | session.commit() 198 | logger.info("Successfully %s paper for %s", 199 | 'added' if is_new_paper else 'updated', paper_data['url']) 200 | 201 | return is_new_paper 202 | except Exception as e: 203 | logger.error("Error adding/updating paper %s: %s", paper_data['url'], str(e)) 204 | 
raise 205 | finally: 206 | session.close() 207 | 208 | def update_notification_status(self, url: str, status: bool) -> bool: 209 | """Update the notification status for a paper. Returns True if successful.""" 210 | logger.info("Updating notification status for %s to %s", url, status) 211 | session = self.session_factory() 212 | try: 213 | paper = session.query(Paper).filter(Paper.url == url).first() 214 | if not paper: 215 | logger.error("Paper not found: %s", url) 216 | return False 217 | paper.notification_sent = status 218 | session.commit() 219 | logger.info("Successfully updated notification status") 220 | return True 221 | except SQLAlchemyError as e: 222 | session.rollback() 223 | logger.error("Error updating notification status: %s", str(e)) 224 | return False 225 | finally: 226 | session.close() 227 | 228 | def get_failed_extractions(self, min_age_hours: int = 1): 229 | """Get papers that failed extraction and haven't been retried recently.""" 230 | logger.info("Fetching failed extractions older than %d hours", min_age_hours) 231 | session = self.session_factory() 232 | try: 233 | retry_cutoff = datetime.now() - timedelta(hours=min_age_hours) 234 | papers = session.query(Paper).filter( 235 | Paper.extraction_success == False, 236 | Paper.last_extraction_attempt < retry_cutoff 237 | ).all() 238 | logger.info("Found %d failed extractions eligible for retry", len(papers)) 239 | return papers 240 | except SQLAlchemyError as e: 241 | logger.error("Error fetching failed extractions: %s", str(e)) 242 | return [] 243 | finally: 244 | session.close() 245 | 246 | 247 | if __name__ == "__main__": 248 | from dotenv import load_dotenv 249 | import os 250 | 251 | load_dotenv() 252 | logger.info("Starting database module directly") 253 | 254 | # Initialize database connection 255 | db = Database(os.getenv("POSTGRES_URL")) 256 | 257 | # Create a test paper entry 258 | test_paper = { 259 | "url": "https://test.paper/123", 260 | "paper_title": "Test Paper for Database Module", 261 | "authors": "John Doe, Jane Smith", 262 | "abstract_body": "This is a test paper to verify database functionality.", 263 | "view_pdf_url": "https://test.paper/123/pdf", 264 | "view_arxiv_page_url": "https://arxiv.org/abs/test.123", 265 | "github_repo_url": "https://github.com/test/repo", 266 | "utc_publication_date_year": 2024, 267 | "utc_publication_date_month": 3, 268 | "utc_publication_date_day": 15, 269 | "utc_submission_date_year": 2024, 270 | "utc_submission_date_month": 3, 271 | "utc_submission_date_day": 1, 272 | "number_of_upvotes": 42, 273 | "number_of_comments": 7 274 | } 275 | 276 | try: 277 | # Test adding a paper 278 | logger.info("Testing paper addition...") 279 | is_new = db.add_paper(test_paper) 280 | logger.info("Paper added successfully (new: %s)", is_new) 281 | 282 | # Test retrieving all papers 283 | logger.info("Testing paper retrieval...") 284 | papers = db.get_all_papers() 285 | logger.info("Retrieved %d papers", len(papers)) 286 | 287 | logger.info("Test completed successfully! ✅") 288 | 289 | except (SQLAlchemyError, ValueError) as e: 290 | logger.error("Test failed! ❌ Error: %s", str(e)) 291 | 292 | # TODO: stream the DB contents to a Notion database a la Chief AI Officer database 293 | # TODO: make db entries nullable. this will require db migrations. 
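# Illustrative sketch only (not part of this repo's migration scripts): one way the
# nullable-entries migration described in the TODO above could look, assuming a plain
# SQLAlchemy connection to the same POSTGRES_URL. The column list mirrors the NOT NULL
# fields on the Paper model; the names and approach here are assumptions.
#
#   from sqlalchemy import create_engine, text
#
#   engine = create_engine(os.getenv("POSTGRES_URL"))
#   with engine.begin() as conn:
#       # Relax NOT NULL so failed-extraction rows can be stored without full metadata
#       for col in ("title", "authors", "abstract", "publication_date", "submission_date"):
#           conn.execute(text(f"ALTER TABLE papers ALTER COLUMN {col} DROP NOT NULL"))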
294 | # TODO: add a column storing wether a post on x has been made for a paper 295 | -------------------------------------------------------------------------------- /llm_extract_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "intro-section" 17 | }, 18 | "source": [ 19 | "# Firecrawl LLM Extract Tutorial\n", 20 | "\n", 21 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 22 | "\n", 23 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 24 | "\n", 25 | "This Jupyter notebook demonstrates how to use Firecrawl's LLM Extract feature to extract structured data from web pages. By the end of this tutorial, you'll be able to:\n", 26 | "\n", 27 | "1. Set up the Firecrawl environment\n", 28 | "2. Extract data using a schema\n", 29 | "3. Extract data using prompts without a schema\n", 30 | "\n", 31 | "This cookbook is designed for developers who want to efficiently extract structured data from web pages using LLMs." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "requirements-section" 38 | }, 39 | "source": [ 40 | "## Requirements\n", 41 | "\n", 42 | "Before proceeding, ensure you have:\n", 43 | "\n", 44 | "- **Firecrawl API key**: Required for accessing the Firecrawl service\n", 45 | "- Python environment with required packages\n", 46 | "\n", 47 | "We'll be using the following packages:\n", 48 | "- `firecrawl`: For interacting with the Firecrawl API\n", 49 | "- `pydantic`: For schema definition" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "setup-section" 56 | }, 57 | "source": [ 58 | "## Setup\n", 59 | "\n", 60 | "First, let's install the required packages:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 1, 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/" 69 | }, 70 | "id": "ytS0D_edJQIH", 71 | "outputId": "2cf0e258-8ae4-4718-b883-bdd4c5b35230" 72 | }, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "name": "stdout", 77 | "text": [ 78 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/164.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 79 | "\u001b[?25h" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%pip install firecrawl-py pydantic --quiet" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "api-key-section" 91 | }, 92 | "source": [ 93 | "Next, let's set up our Firecrawl API key:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 2, 99 | "metadata": { 100 | "colab": { 101 | "base_uri": "https://localhost:8080/" 102 | }, 103 | "id": "5S7s7PEmJQII", 104 | "outputId": "c42bd26a-3f6b-47c5-9391-3d3dc306d033" 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Enter your Firecrawl API key: ··········\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "from getpass import getpass\n", 117 | "api_key = getpass(\"Enter your Firecrawl API key: \")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | 
"id": "schema-extraction-section" 124 | }, 125 | "source": [ 126 | "## Extracting Data with Schema\n", 127 | "\n", 128 | "Let's start by importing the required libraries and defining our schema for extraction:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 3, 134 | "metadata": { 135 | "colab": { 136 | "base_uri": "https://localhost:8080/" 137 | }, 138 | "id": "KgQoSEgdJQII", 139 | "outputId": "452e9c86-3c14-4c28-ef9a-e4ccedc17a4b" 140 | }, 141 | "outputs": [ 142 | { 143 | "output_type": "stream", 144 | "name": "stdout", 145 | "text": [ 146 | "{'company_mission': \"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\", 'supports_sso': True, 'is_open_source': False, 'is_in_yc': True}\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "from firecrawl import FirecrawlApp\n", 152 | "from pydantic import BaseModel, Field\n", 153 | "\n", 154 | "# Initialize the FirecrawlApp with your API key\n", 155 | "app = FirecrawlApp(api_key=api_key)\n", 156 | "\n", 157 | "class ExtractSchema(BaseModel):\n", 158 | " company_mission: str\n", 159 | " supports_sso: bool\n", 160 | " is_open_source: bool\n", 161 | " is_in_yc: bool\n", 162 | "\n", 163 | "data = app.scrape_url('https://docs.firecrawl.dev/', {\n", 164 | " 'formats': ['extract'],\n", 165 | " 'extract': {\n", 166 | " 'schema': ExtractSchema.model_json_schema(),\n", 167 | " }\n", 168 | "})\n", 169 | "\n", 170 | "print(data['extract'])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "prompt-extraction-section" 177 | }, 178 | "source": [ 179 | "## Extracting Data without Schema\n", 180 | "\n", 181 | "Firecrawl also supports extraction using just a prompt, allowing the LLM to determine the structure:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 7, 187 | "metadata": { 188 | "colab": { 189 | "base_uri": "https://localhost:8080/" 190 | }, 191 | "id": "JU57r_bfJQIJ", 192 | "outputId": "49cc9378-6854-4565-dfd6-00cd9e2177c7" 193 | }, 194 | "outputs": [ 195 | { 196 | "output_type": "stream", 197 | "name": "stdout", 198 | "text": [ 199 | "{\"success\":true,\"data\":{\"extract\":{\"company_mission\":\"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\"},\"metadata\":{\"title\":\"Quickstart | Firecrawl\",\"description\":\"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\"language\":\"en\",\"ogLocaleAlternate\":[],\"viewport\":\"width=device-width\",\"msapplication-config\":\"https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/_generated/favicon/browserconfig.xml?v=3\",\"apple-mobile-web-app-title\":\"Firecrawl Docs\",\"application-name\":\"Firecrawl Docs\",\"msapplication-TileColor\":\"#000\",\"theme-color\":\"#ffffff\",\"charset\":\"utf-8\",\"og:type\":\"website\",\"og:site_name\":\"Firecrawl Docs\",\"twitter:card\":\"summary_large_image\",\"og:title\":\"Quickstart | Firecrawl\",\"twitter:title\":\"Firecrawl Docs\",\"og:image\":\"/images/og.png\",\"twitter:image\":\"/images/og.png\",\"og:description\":\"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\"og:url\":\"https://docs.firecrawl.dev/introduction\",\"next-head-count\":\"25\",\"sourceURL\":\"https://docs.firecrawl.dev/\",\"statusCode\":200}}}" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# Method 1: Using curl with a properly formatted command string\n", 205 | "curl_command = f'''\n", 206 | "curl -X POST 
https://api.firecrawl.dev/v1/scrape \\\n", 207 | " -H 'Content-Type: application/json' \\\n", 208 | " -H 'Authorization: Bearer {api_key}' \\\n", 209 | " -d '{{\n", 210 | " \"url\": \"https://docs.firecrawl.dev/\",\n", 211 | " \"formats\": [\"extract\"],\n", 212 | " \"extract\": {{\n", 213 | " \"prompt\": \"Extract the company mission from the page.\"\n", 214 | " }}\n", 215 | " }}'\n", 216 | "'''\n", 217 | "\n", 218 | "!{curl_command}" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "source": [ 224 | "# Method 2: Alternative approach using requests library\n", 225 | "import requests\n", 226 | "import json\n", 227 | "\n", 228 | "url = \"https://api.firecrawl.dev/v1/scrape\"\n", 229 | "headers = {\n", 230 | " \"Content-Type\": \"application/json\",\n", 231 | " \"Authorization\": f\"Bearer {api_key}\"\n", 232 | "}\n", 233 | "payload = {\n", 234 | " \"url\": \"https://docs.firecrawl.dev/\",\n", 235 | " \"formats\": [\"extract\"],\n", 236 | " \"extract\": {\n", 237 | " \"prompt\": \"Extract the company mission from the page.\"\n", 238 | " }\n", 239 | "}\n", 240 | "\n", 241 | "response = requests.post(url, headers=headers, json=payload)\n", 242 | "print(json.dumps(response.json(), indent=2))" 243 | ], 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "SJZydfRNT1Sc", 249 | "outputId": "3aed84ce-26f3-4714-b625-b73944031788" 250 | }, 251 | "execution_count": 8, 252 | "outputs": [ 253 | { 254 | "output_type": "stream", 255 | "name": "stdout", 256 | "text": [ 257 | "{\n", 258 | " \"success\": true,\n", 259 | " \"data\": {\n", 260 | " \"extract\": {\n", 261 | " \"company_mission\": \"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\"\n", 262 | " },\n", 263 | " \"metadata\": {\n", 264 | " \"title\": \"Quickstart | Firecrawl\",\n", 265 | " \"description\": \"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\n", 266 | " \"language\": \"en\",\n", 267 | " \"ogLocaleAlternate\": [],\n", 268 | " \"viewport\": \"width=device-width\",\n", 269 | " \"msapplication-config\": \"https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/_generated/favicon/browserconfig.xml?v=3\",\n", 270 | " \"apple-mobile-web-app-title\": \"Firecrawl Docs\",\n", 271 | " \"application-name\": \"Firecrawl Docs\",\n", 272 | " \"msapplication-TileColor\": \"#000\",\n", 273 | " \"theme-color\": \"#ffffff\",\n", 274 | " \"charset\": \"utf-8\",\n", 275 | " \"og:type\": \"website\",\n", 276 | " \"og:site_name\": \"Firecrawl Docs\",\n", 277 | " \"twitter:card\": \"summary_large_image\",\n", 278 | " \"og:title\": \"Quickstart | Firecrawl\",\n", 279 | " \"twitter:title\": \"Firecrawl Docs\",\n", 280 | " \"og:image\": \"/images/og.png\",\n", 281 | " \"twitter:image\": \"/images/og.png\",\n", 282 | " \"og:description\": \"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\n", 283 | " \"og:url\": \"https://docs.firecrawl.dev/introduction\",\n", 284 | " \"next-head-count\": \"25\",\n", 285 | " \"sourceURL\": \"https://docs.firecrawl.dev/\",\n", 286 | " \"statusCode\": 200\n", 287 | " }\n", 288 | " }\n", 289 | "}\n" 290 | ] 291 | } 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "id": "closing-section" 298 | }, 299 | "source": [ 300 | "## Next Steps\n", 301 | "\n", 302 | "You've now learned how to:\n", 303 | "1. Set up Firecrawl for data extraction\n", 304 | "2. Extract data using a defined schema\n", 305 | "3. 
Extract data using prompts without a schema\n", 306 | "\n", 307 | "For more information about the extract format and additional features, visit the [Firecrawl documentation](https://docs.firecrawl.dev/features/extract)." 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "kernelspec": { 313 | "display_name": "Python 3", 314 | "language": "python", 315 | "name": "python3" 316 | }, 317 | "language_info": { 318 | "codemirror_mode": { 319 | "name": "ipython", 320 | "version": 3 321 | }, 322 | "file_extension": ".py", 323 | "mimetype": "text/x-python", 324 | "name": "python", 325 | "nbconvert_exporter": "python", 326 | "pygments_lexer": "ipython3", 327 | "version": "3.8.0" 328 | }, 329 | "colab": { 330 | "provenance": [], 331 | "include_colab_link": true 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 0 336 | } -------------------------------------------------------------------------------- /crawl_and_extract_with_openai_o1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "ULHdYNR8QlPF" 17 | }, 18 | "source": [ 19 | "# Integrating OpenAI's o1 Reasoning Models with Firecrawl: A Step-by-Step Guide\n", 20 | "\n", 21 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 22 | "\n", 23 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 24 | "\n", 25 | "OpenAI has recently unveiled its o1 series models, marking a significant leap in the realm of complex reasoning with AI. These models are designed to \"think before they answer,\" producing extensive internal chains of thought before responding. In this guide, we'll explore how to integrate these powerful models into your applications, with a practical example of crawling a website using the o1-preview model.\n", 26 | "\n", 27 | "**This Jupyter notebook** demonstrates how to integrate OpenAI's o1 reasoning models with Firecrawl technology to perform complex tasks like crawling a website and extracting specific information.\n", 28 | "\n", 29 | "By the end of this notebook, you'll be able to:\n", 30 | "\n", 31 | "- Set up the Firecrawl and OpenAI environments\n", 32 | "- Use the o1-preview model to enhance the crawling process\n", 33 | "- Crawl a website and generate a list of relevant URLs based on a given objective\n", 34 | "- Extract content from crawled pages in Markdown\n", 35 | "- Evaluate the extracted content using the o1 reasoning model to check if it meets the specified objective\n", 36 | "\n", 37 | "This guide is designed for developers and data scientists who want to leverage advanced AI reasoning capabilities and web crawling technology to efficiently gather and analyze information from the web.\n", 38 | "\n", 39 | "## Requirements\n", 40 | "\n", 41 | "Before proceeding, ensure you have the following:\n", 42 | "\n", 43 | "- Firecrawl API key: Essential for accessing the Firecrawl service\n", 44 | "- OpenAI API key: Required for using the o1 reasoning models" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "sypSu88zQlPG" 51 | }, 52 | "source": [ 53 | "## Introduction to o1 Models\n", 54 | "\n", 55 | "The o1 models are large language models trained with reinforcement learning to excel in complex reasoning tasks. 
There are two models available:\n", 56 | "\n", 57 | "- **o1-preview**: An early preview designed for reasoning about hard problems using broad general knowledge.\n", 58 | "- **o1-mini**: A faster, cost-effective version ideal for coding, math, and science tasks that don't require extensive general knowledge.\n", 59 | "\n", 60 | "While these models offer significant advancements, they are not intended to replace GPT-4o in all use cases. If your application requires image inputs, function calling, or consistent fast response times, GPT-4o and GPT-4o mini remain the optimal choices." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "id": "DlPHD-WNQlPG" 67 | }, 68 | "source": [ 69 | "## Prerequisites\n", 70 | "\n", 71 | "First, let's install the required libraries:" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 1, 77 | "metadata": { 78 | "colab": { 79 | "base_uri": "https://localhost:8080/" 80 | }, 81 | "id": "METpmDKFQlPH", 82 | "outputId": "b32a1fc8-b6ee-4268-e4b3-719a770d5a03" 83 | }, 84 | "source": [ 85 | "%pip install -q firecrawl-py openai python-dotenv" 86 | ], 87 | "outputs": [ 88 | { 89 | "output_type": "stream", 90 | "name": "stdout", 91 | "text": [ 92 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/386.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m378.9/386.9 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m386.9/386.9 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 93 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/76.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 94 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/78.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 95 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/325.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m325.2/325.2 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 96 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 97 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 98 | "\u001b[?25h" 99 | ] 100 | } 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "id": "PzI4OO5EQlPH" 107 | }, 108 | "source": [ 109 | "## Step 1: Import Necessary Libraries" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "id": "nEBEShvoQlPH" 117 | }, 118 | "source": [ 119 | "import os\n", 120 | "from firecrawl import FirecrawlApp\n", 121 | "import 
json\n", 122 | "from dotenv import load_dotenv\n", 123 | "from openai import OpenAI" 124 | ], 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "id": "HKWKvisAQlPH" 131 | }, 132 | "source": [ 133 | "## Step 2: Load Environment Variables\n", 134 | "\n", 135 | "For Google Colab, we'll set the environment variables directly instead of using a .env file. In practice, you should never expose your API keys in your notebook." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "id": "G8vBvKDiQlPI" 143 | }, 144 | "source": [ 145 | "# For development, use environment variables\n", 146 | "os.environ['FIRECRAWL_API_KEY'] = 'your_firecrawl_api_key_here'\n", 147 | "os.environ['OPENAI_API_KEY'] = 'your_openai_api_key_here'\n", 148 | "\n", 149 | "# Retrieve API keys from environment variables\n", 150 | "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", 151 | "openai_api_key = os.getenv(\"OPENAI_API_KEY\")" 152 | ], 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "id": "_jNb62ZnQlPI" 159 | }, 160 | "source": [ 161 | "## Step 3: Initialize the FirecrawlApp and OpenAI Client" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "id": "xRUYdpUsQlPI" 169 | }, 170 | "source": [ 171 | "# Initialize the FirecrawlApp and OpenAI client\n", 172 | "app = FirecrawlApp(api_key=firecrawl_api_key)\n", 173 | "client = OpenAI(api_key=openai_api_key)" 174 | ], 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "id": "ZYIFGPKwQlPI" 181 | }, 182 | "source": [ 183 | "## Step 4: Define the Objective and URL" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "id": "NcEE8cljQlPI" 191 | }, 192 | "source": [ 193 | "url = \"https://example.com\"\n", 194 | "objective = \"Find the contact email for customer support\"" 195 | ], 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "id": "0oYsHFCzQlPI" 202 | }, 203 | "source": [ 204 | "## Step 5: Determine the Search Parameter Using o1-preview" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "id": "jIvi4bTRQlPI" 212 | }, 213 | "source": [ 214 | "map_prompt = f\"\"\"\n", 215 | "The map function generates a list of URLs from a website and accepts a search parameter. Based on the objective: {objective}, suggest a 1-2 word search parameter to find the needed information. 
Only respond with 1-2 words.\n", 216 | "\"\"\"\n", 217 | "\n", 218 | "# OpenAI API call\n", 219 | "completion = client.chat.completions.create(\n", 220 | " model=\"o1-preview\",\n", 221 | " messages=[\n", 222 | " {\"role\": \"user\", \"content\": map_prompt}\n", 223 | " ]\n", 224 | ")\n", 225 | "\n", 226 | "map_search_parameter = completion.choices[0].message.content.strip()\n", 227 | "print(f\"Search parameter: {map_search_parameter}\")" 228 | ], 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "id": "DrYCmYtEQlPI" 235 | }, 236 | "source": [ 237 | "## Step 6: Map the Website Using the Search Parameter" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "id": "Zir5O-HbQlPI" 245 | }, 246 | "source": [ 247 | "map_website = app.map_url(url, params={\"search\": map_search_parameter})\n", 248 | "print(\"Mapped URLs:\", map_website)" 249 | ], 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "id": "ejGIGykTQlPI" 256 | }, 257 | "source": [ 258 | "## Step 7: Scrape the Top Pages and Check for the Objective" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "id": "MZHYRT36QlPI" 266 | }, 267 | "source": [ 268 | "# Get top 3 links\n", 269 | "top_links = map_website[:3] if isinstance(map_website, list) else []\n", 270 | "\n", 271 | "for link in top_links:\n", 272 | " # Scrape the page\n", 273 | " scrape_result = app.scrape_url(link, params={'formats': ['markdown']})\n", 274 | "\n", 275 | " # Check if objective is met\n", 276 | " check_prompt = f\"\"\"\n", 277 | " Given the following scraped content and objective, determine if the objective is met with high confidence.\n", 278 | " If it is, extract the relevant information in a simple and concise JSON format.\n", 279 | " If the objective is not met with high confidence, respond with 'Objective not met'.\n", 280 | "\n", 281 | " Objective: {objective}\n", 282 | " Scraped content: {scrape_result['markdown']}\n", 283 | " \"\"\"\n", 284 | "\n", 285 | " completion = client.chat.completions.create(\n", 286 | " model=\"o1-preview\",\n", 287 | " messages=[\n", 288 | " {\"role\": \"user\", \"content\": check_prompt}\n", 289 | " ]\n", 290 | " )\n", 291 | "\n", 292 | " result = completion.choices[0].message.content.strip()\n", 293 | "\n", 294 | " if result != \"Objective not met\":\n", 295 | " try:\n", 296 | " extracted_info = json.loads(result)\n", 297 | " break\n", 298 | " except json.JSONDecodeError:\n", 299 | " continue\n", 300 | "else:\n", 301 | " extracted_info = None" 302 | ], 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "id": "JNBIwUbhQlPJ" 309 | }, 310 | "source": [ 311 | "## Step 8: Display the Extracted Information" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "id": "Ivv4ze9OQlPJ" 319 | }, 320 | "source": [ 321 | "if extracted_info:\n", 322 | " print(\"Extracted Information:\")\n", 323 | " print(json.dumps(extracted_info, indent=2))\n", 324 | "else:\n", 325 | " print(\"Objective not met with the available content.\")" 326 | ], 327 | "outputs": [] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "id": "39Sb3XlFQlPJ" 333 | }, 334 | "source": [ 335 | "## Conclusion\n", 336 | "\n", 337 | "In this notebook, we've explored how to integrate OpenAI's new o1 reasoning models into your applications to perform complex tasks 
like crawling a website and extracting specific information. The o1 models showcase impressive capabilities in reasoning and problem-solving, making them valuable tools for developers tackling challenging AI tasks.\n", 338 | "\n", 339 | "Whether you're working on advanced coding problems, mathematical computations, or intricate scientific queries, the o1 models can significantly enhance your application's reasoning abilities.\n", 340 | "\n", 341 | "Happy coding!" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.8.0" 362 | }, 363 | "colab": { 364 | "provenance": [], 365 | "include_colab_link": true 366 | } 367 | }, 368 | "nbformat": 4, 369 | "nbformat_minor": 0 370 | } 371 | -------------------------------------------------------------------------------- /crawl_and_extract_with_xai_grok.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "intro-section" 33 | }, 34 | "source": [ 35 | "# Building a Web Crawler with Grok-2 and Firecrawl\n", 36 | "\n", 37 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 38 | "\n", 39 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 40 | "\n", 41 | "This Jupyter notebook demonstrates how to combine Grok-2's language model capabilities with Firecrawl's web scraping features to build an intelligent web crawler that can extract structured data from websites.\n", 42 | "\n", 43 | "By the end of this notebook, you'll be able to:\n", 44 | "\n", 45 | "1. Set up the Grok-2 and Firecrawl environment\n", 46 | "2. Build a targeted web crawler that understands content\n", 47 | "3. Extract and process structured data from websites\n", 48 | "4. Export the processed content in JSON format\n", 49 | "\n", 50 | "This cookbook is designed for developers and data scientists who want to build advanced web crawlers with AI-powered content understanding." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "9rQmTgiMVk5X" 57 | }, 58 | "source": [ 59 | "## Setup\n", 60 | "\n", 61 | "First, let's install the required packages:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "id": "zBE3KvuKVk5X" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "%pip install firecrawl-py requests python-dotenv --quiet" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "id": "q2GWsM_gVk5X" 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "import os\n", 84 | "import json\n", 85 | "import requests\n", 86 | "from dotenv import load_dotenv\n", 87 | "from firecrawl import FirecrawlApp" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "id": "VqpYy-sxVk5Y" 94 | }, 95 | "source": [ 96 | "## Initialize Environment\n", 97 | "\n", 98 | "Enter your API keys securely:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "id": "Wrec0L1sVk5Y" 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from getpass import getpass\n", 110 | "\n", 111 | "# Securely get API keys\n", 112 | "grok_api_key = getpass(\"Enter your Grok-2 API key: \")\n", 113 | "firecrawl_api_key = getpass(\"Enter your Firecrawl API key: \")\n", 114 | "\n", 115 | "# Initialize FirecrawlApp\n", 116 | "app = FirecrawlApp(api_key=firecrawl_api_key)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "id": "uiFw3MfjVk5Y" 123 | }, 124 | "source": [ 125 | "## Define Grok-2 API Interaction\n", 126 | "\n", 127 | "Let's create a function to handle interactions with the Grok-2 API, including comprehensive error handling and debugging information:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "id": "PTc5bc85Vk5Y" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "def grok_completion(prompt):\n", 139 | " url = \"https://api.x.ai/v1/chat/completions\"\n", 140 | " headers = {\n", 141 | " \"Content-Type\": \"application/json\",\n", 142 | " \"Authorization\": f\"Bearer {grok_api_key}\"\n", 143 | " }\n", 144 | " data = {\n", 145 | " \"messages\": [\n", 146 | " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", 147 | " {\"role\": \"user\", \"content\": prompt}\n", 148 | " ],\n", 149 | " \"model\": \"grok-beta\",\n", 150 | " \"stream\": False,\n", 151 | " \"temperature\": 0\n", 152 | " }\n", 153 | "\n", 154 | " try:\n", 155 | " response = requests.post(url, headers=headers, json=data)\n", 156 | " print(f\"\\nAPI Response Status Code: {response.status_code}\")\n", 157 | "\n", 158 | " if response.status_code != 200:\n", 159 | " print(f\"Error Response: {response.text}\")\n", 160 | " return None\n", 161 | "\n", 162 | " response_data = response.json()\n", 163 | " print(\"\\nFull API Response:\")\n", 164 | " print(json.dumps(response_data, indent=2))\n", 165 | "\n", 166 | " if 'choices' not in response_data:\n", 167 | " print(\"\\nWarning: 'choices' key not found in response\")\n", 168 | " print(\"Available keys:\", list(response_data.keys()))\n", 169 | " return None\n", 170 | "\n", 171 | " if not response_data['choices']:\n", 172 | " print(\"\\nWarning: 'choices' array is empty\")\n", 173 | " return None\n", 174 | "\n", 175 | " choice = response_data['choices'][0]\n", 176 | " if 'message' not in choice:\n", 177 | " print(\"\\nWarning: 'message' key not found in first choice\")\n", 178 | " 
print(\"Available keys in choice:\", list(choice.keys()))\n", 179 | " return None\n", 180 | "\n", 181 | " if 'content' not in choice['message']:\n", 182 | " print(\"\\nWarning: 'content' key not found in message\")\n", 183 | " print(\"Available keys in message:\", list(choice['message'].keys()))\n", 184 | " return None\n", 185 | "\n", 186 | " return choice['message']['content']\n", 187 | "\n", 188 | " except requests.exceptions.RequestException as e:\n", 189 | " print(f\"\\nRequest Error: {str(e)}\")\n", 190 | " return None\n", 191 | " except json.JSONDecodeError as e:\n", 192 | " print(f\"\\nJSON Decode Error: {str(e)}\")\n", 193 | " print(\"Raw Response:\", response.text)\n", 194 | " return None\n", 195 | " except Exception as e:\n", 196 | " print(f\"\\nUnexpected Error: {str(e)}\")\n", 197 | " return None" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "id": "Wecuh9TbVk5Y" 204 | }, 205 | "source": [ 206 | "## Website Crawling Functions\n", 207 | "\n", 208 | "This function combines Grok-2's understanding with Firecrawl's search capabilities to find relevant pages. It:\n", 209 | "\n", 210 | "1. Uses Grok-2 to distill the user's objective into a focused search term\n", 211 | "2. Enforces strict formatting rules for consistent search terms\n", 212 | "3. Cleans and normalizes the search output\n", 213 | "4. Uses Firecrawl's map endpoint to discover relevant pages\n", 214 | "\n", 215 | "The function takes a broad objective (e.g., \"Find articles about startup investments\") and converts it into an optimized search term (e.g., \"startup funding\") to ensure targeted results.\n", 216 | "\n", 217 | "Note: The function limits search terms to 2 words maximum for optimal performance with Firecrawl's search algorithm." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "cGA5X6PDVk5Y" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "def find_relevant_pages(objective, url):\n", 229 | " prompt = f\"\"\"Based on the objective '{objective}', provide ONLY a 1-2 word search term to locate relevant information on the website.\n", 230 | "\n", 231 | "Rules:\n", 232 | "- Return ONLY the search term, nothing else\n", 233 | "- Maximum 2 words\n", 234 | "- No punctuation or formatting\n", 235 | "- No explanatory text\"\"\"\n", 236 | "\n", 237 | " search_term = grok_completion(prompt)\n", 238 | "\n", 239 | " if search_term is None:\n", 240 | " print(\"Failed to get search term from Grok-2 API\")\n", 241 | " return []\n", 242 | "\n", 243 | " # Clean up the search term\n", 244 | " search_term = search_term.strip().replace('\"', '').replace('*', '')\n", 245 | " words = search_term.split()\n", 246 | " if len(words) > 2:\n", 247 | " search_term = \" \".join(words[:2])\n", 248 | "\n", 249 | " print(f\"Using search term: '{search_term}'\")\n", 250 | "\n", 251 | " try:\n", 252 | " map_result = app.map_url(url, params={\"search\": search_term})\n", 253 | " return map_result.get(\"links\", [])\n", 254 | " except Exception as e:\n", 255 | " print(f\"Error mapping URL: {str(e)}\")\n", 256 | " return []" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "## Content Extraction and Processing\n", 263 | "\n", 264 | "This function handles the extraction and intelligent processing of content from each webpage. It:\n", 265 | "\n", 266 | "1. Scrapes content from each relevant page\n", 267 | "2. Uses Grok-2 to analyze the content against our objective\n", 268 | "3. 
Extracts structured data in JSON format\n", 269 | "4. Handles various edge cases and errors\n", 270 | "\n", 271 | "The function processes up to 3 pages and returns the first successful match, using Grok-2 to determine relevance and extract specific data points." 272 | ], 273 | "metadata": { 274 | "id": "XSuxErSsYH1L" 275 | } 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "id": "lffHeKe-Vk5Y" 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def extract_data_from_pages(links, objective):\n", 286 | " for link in links[:3]:\n", 287 | " try:\n", 288 | " print(f\"\\nProcessing link: {link}\")\n", 289 | " scrape_result = app.scrape_url(link, params={'formats': ['markdown']})\n", 290 | " content = scrape_result.get('markdown', '')\n", 291 | "\n", 292 | " if not content:\n", 293 | " print(\"No content extracted from page\")\n", 294 | " continue\n", 295 | "\n", 296 | " prompt = f\"\"\"Given the following content, extract the information related to the objective '{objective}' in JSON format. If not found, reply 'Objective not met'.\n", 297 | "\n", 298 | "Content: {content}\n", 299 | "\n", 300 | "Remember:\n", 301 | "- Only return JSON if the objective is met.\n", 302 | "- Do not include any extra text or markdown formatting.\n", 303 | "- Do not wrap the JSON in code blocks.\n", 304 | "\"\"\"\n", 305 | " result = grok_completion(prompt)\n", 306 | "\n", 307 | " if result is None:\n", 308 | " print(\"Failed to get response from Grok-2 API\")\n", 309 | " continue\n", 310 | "\n", 311 | " result = result.strip()\n", 312 | "\n", 313 | " # Handle case where response is wrapped in code blocks\n", 314 | " if result.startswith(\"```\") and result.endswith(\"```\"):\n", 315 | " # Remove the code block markers and any language identifier\n", 316 | " result = result.split(\"\\n\", 1)[1].rsplit(\"\\n\", 1)[0]\n", 317 | "\n", 318 | " if result != \"Objective not met\":\n", 319 | " try:\n", 320 | " data = json.loads(result)\n", 321 | " return data\n", 322 | " except json.JSONDecodeError as e:\n", 323 | " print(f\"Error parsing JSON response: {str(e)}\")\n", 324 | " print(\"Raw response:\", result)\n", 325 | " continue\n", 326 | " else:\n", 327 | " print(\"Objective not met for this page\")\n", 328 | "\n", 329 | " except Exception as e:\n", 330 | " print(f\"Error processing page: {str(e)}\")\n", 331 | " continue\n", 332 | "\n", 333 | " return None" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "id": "hT7hScXvVk5Y" 340 | }, 341 | "source": [ 342 | "## Main Execution\n", 343 | "\n", 344 | "Let's create and run the main function that ties everything together:" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "id": "C_wDgTBOVk5Y" 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "import pprint\n", 356 | "\n", 357 | "def main():\n", 358 | " url = input(\"Enter the website URL to crawl: \")\n", 359 | " objective = input(\"Enter your data extraction objective: \")\n", 360 | "\n", 361 | " print(\"\\nFinding relevant pages...\")\n", 362 | " links = find_relevant_pages(objective, url)\n", 363 | "\n", 364 | " if not links:\n", 365 | " print(\"No relevant pages found.\")\n", 366 | " return\n", 367 | "\n", 368 | " print(f\"\\nFound {len(links)} relevant pages:\")\n", 369 | " for i, link in enumerate(links[:3], 1):\n", 370 | " pprint.pprint(f\"{i}. 
{link}\")\n", 371 | "\n", 372 | " print(\"\\nExtracting data from pages...\")\n", 373 | " data = extract_data_from_pages(links, objective)\n", 374 | "\n", 375 | " if data:\n", 376 | " print(\"\\nData extracted successfully:\")\n", 377 | " pprint.pprint(json.dumps(data, indent=2))\n", 378 | " else:\n", 379 | " print(\"Could not find data matching the objective.\")" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "id": "TtIGh9jSVk5Z" 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "# Run the crawler\n", 391 | "main()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "id": "PZPOttGoVk5Z" 398 | }, 399 | "source": [ 400 | "## What's Next?\n", 401 | "\n", 402 | "Now that you have a working web crawler, consider these enhancements:\n", 403 | "\n", 404 | "1. Add error handling and retries\n", 405 | "2. Implement concurrent processing\n", 406 | "3. Add content filtering and validation\n", 407 | "4. Create custom extraction rules\n", 408 | "\n", 409 | "The combination of Grok-2 and Firecrawl offers powerful possibilities for intelligent web scraping and content analysis.\n", 410 | "\n", 411 | "## Additional Resources\n", 412 | "\n", 413 | "- [x.ai Grok-2 API Documentation](https://api.x.ai/docs)\n", 414 | "- [Firecrawl Python Library Documentation](https://docs.firecrawl.dev)\n", 415 | "- [Example Code Repository](https://github.com/example/web-crawler)" 416 | ] 417 | } 418 | ] 419 | } 420 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Firecrawl Quickstarts Logo 5 | 6 | 7 | 8 | # Hugging Face "Daily Papers" Tracker 9 | 10 | This system provides **automated notifications** about the latest white papers published on the [Hugging Face Daily Papers](https://huggingface.co/papers) page. Using **Firecrawl's semantic crawling and scraping capabilities** (Crawl and Extract APIs), it fetches and processes new publications daily. The system uses semantic filtering to determine which papers are most relevant to the user's interests, based on a user-defined category prompt, and delivers notifications directly to Discord. 11 | 12 | ### Key Features 13 | - **Daily Notifications**: Receive real-time updates about the latest papers that match your research interests. 14 | - **Firecrawl Integration**: 15 | - **Crawl API**: Retrieves the list of newly published papers from the Hugging Face Daily Papers page. 16 | - **Extract API**: Extracts structured and semantically enriched data from each paper for filtering and analysis. 17 | - **Semantic Filtering**: Matches papers to the user's category of interest using a customizable category prompt. 18 | - **Customizable Interests**: Easily define your research area by editing the `category_prompt` file. 19 | - **Default Configuration**: Preconfigured to track papers related to AI Agents, but can be adapted for any topic. 20 | 21 | ### How It Works 22 | 1. **Crawl**: The system uses the Firecrawl Crawl API to retrieve today's list of papers from the Hugging Face Daily Papers page. 23 | 2. **Extract**: It processes and extracts structured semantic data from each paper using the Firecrawl Extract API. 24 | 3. **Store**: All extracted paper data is stored in a **Supabase database** for future reference and analysis. 25 | 4. 
**Filter**: Semantic filtering identifies papers relevant to the category defined in `category_prompt`. 26 | 5. **Notify**: Sends summaries of the filtered papers directly to a Discord channel. 27 | 28 | ### System Architecture 29 | ```mermaid 30 | flowchart TB 31 | subgraph entry ["Entry Points"] 32 | CLI["CLI Arguments\n--url or --date"] 33 | GHA["GitHub Actions\nScheduled/Manual Trigger"] 34 | end 35 | 36 | subgraph hf_white_paper_tracker ["hf_white_paper_tracker.py"] 37 | A[Initialize Logger] --> B[Initialize Database] 38 | B --> C[Verify DB Connection/Version] 39 | C --> D{"URL Source?"} 40 | D -->|CLI Args| E1["Use Provided URL"] 41 | D -->|No Args| E2["Get Today's URL"] 42 | E1 --> F[Run Paper Tracker] 43 | E2 --> F 44 | end 45 | 46 | subgraph firecrawl_crawl_extract ["firecrawl_crawl_extract.py"] 47 | G[Extract Paper URLs] --> H[Process Paper Batch] 48 | H --> I[Extract Paper Details] 49 | end 50 | 51 | subgraph semantic_filter ["semantic_filter.py"] 52 | J[Load Category Definition] --> K[Classify Paper] 53 | K --> L{Meets Criteria?} 54 | end 55 | 56 | subgraph discord_notifications ["discord_notifications.py"] 57 | M[Format Message] --> N[Send to Discord] 58 | end 59 | 60 | subgraph supabase_db ["supabase_db.py"] 61 | O[Store Paper Data] --> P[Update Status] 62 | end 63 | 64 | %% Module connections 65 | CLI -->|Optional| hf_white_paper_tracker 66 | GHA -->|Scheduled| hf_white_paper_tracker 67 | F -->|Calls| G 68 | I -->|Passes paper data| J 69 | L -->|Yes| M 70 | L -->|No| O 71 | N -->|Update notification status| P 72 | 73 | %% Environment dependencies 74 | env[".env File"] -.->|Config| hf_white_paper_tracker 75 | category[category_prompt.py] -.->|Definition| semantic_filter 76 | 77 | classDef module fill:#f9f,stroke:#333,stroke-width:2px; 78 | class hf_white_paper_tracker,firecrawl_crawl_extract,semantic_filter,discord_notifications,supabase_db module; 79 | classDef entry fill:#aaf,stroke:#333,stroke-width:2px; 80 | class CLI,GHA entry; 81 | ``` 82 | 83 | ### Setup 84 | 1. Configure your Firecrawl API keys. 85 | 2. Modify the `category_prompt` to specify your topic of interest. 86 | 3. Set up a Discord webhook for receiving notifications. 87 | 4. Run the system to start tracking and getting updates. 88 | 89 | ## Setup 90 | 91 | 1. Clone the repository 92 | 93 | 2. Install Poetry (if not already installed): 94 | ```bash 95 | curl -sSL https://install.python-poetry.org | python3 - 96 | ``` 97 | 98 | 3. Install dependencies: 99 | ```bash 100 | poetry install 101 | ``` 102 | 103 | 4. Ensure PostgreSQL is installed: 104 | - PostgreSQL must be installed and pg_config must be in your PATH. 105 | - On macOS (using Homebrew): 106 | ```bash 107 | brew install postgresql 108 | ``` 109 | - Verify pg_config: 110 | ```bash 111 | pg_config --version 112 | ``` 113 | 114 | 5. Set up Supabase Database: 115 | 1. Create a Supabase account at [Supabase](https://supabase.com) 116 | 2. Create a new project 117 | 3. Go to Project Settings > Database 118 | 4. Note down: 119 | - Database password (set during project creation) 120 | - Connection string/URI 121 | - Project reference ID 122 | 5. Enable the PostgREST API: 123 | - Go to Project Settings > API 124 | - Ensure PostgREST is enabled 125 | 6. Configure database tables: 126 | - The application will automatically create the required tables: 127 | - `papers`: Stores paper information and tracking status 128 | - `schema_version`: Manages database migrations 129 | 7. 
Important connection string notes: 130 | - For local development, use port 5432: 131 | ``` 132 | POSTGRES_URL=postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres?sslmode=require 133 | ``` 134 | - For GitHub Actions, use port 6543: 135 | ``` 136 | POSTGRES_URL=postgresql://postgres.[YOUR-PROJECT-REF]:[YOUR-PASSWORD]@aws-0-[REGION].pooler.supabase.com:6543/postgres?sslmode=require 137 | ``` 138 | - Remember to URL-encode special characters in your password: 139 | - `#` → `%23` 140 | - `$` → `%24` 141 | - `^` → `%5E` 142 | - `&` → `%26` 143 | - `@` stays as `@` 144 | 145 | 6. Set up Discord Notifications: 146 | 1. Create a Discord server (skip if you already have one) 147 | 2. Create a channel for paper notifications 148 | 3. Configure the webhook: 149 | - Go to Server Settings > Integrations 150 | - Click on "Create Webhook" (or edit an existing one) 151 | - Set a name for your webhook (e.g., "Paper Tracker") 152 | - Select the channel where notifications should be sent 153 | - Copy the Webhook URL 154 | 4. Important webhook URL format: 155 | ``` 156 | https://discord.com/api/webhooks/{webhook.id}/{webhook.token} 157 | ``` 158 | 5. The notifications will include: 159 | - Paper title 160 | - Authors 161 | - Abstract (first 500 characters) 162 | - Engagement stats (upvotes and comments) 163 | - Links to: 164 | - PDF version 165 | - arXiv page 166 | - GitHub repository (if available) 167 | - Original HuggingFace post 168 | 6. Security notes: 169 | - Keep your webhook URL private 170 | - The webhook URL contains a secret token 171 | - If compromised, you can regenerate the webhook token in Discord 172 | - Add the URL to your `.env` file: 173 | ``` 174 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 175 | ``` 176 | 177 | 7. Configure environment variables: 178 | ```bash 179 | cp .env.example .env 180 | ``` 181 | Then edit `.env` with: 182 | 183 | a. Discord Webhook URL: 184 | 1. Go to your Discord server 185 | 2. Edit a channel > Integrations > Create Webhook 186 | 3. Copy the Webhook URL 187 | 4. Add to `.env`: 188 | ``` 189 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 190 | ``` 191 | 192 | b. Firecrawl API Key: 193 | 1. Sign up at [Firecrawl](https://firecrawl.co) 194 | 2. Go to API Keys section 195 | 3. Create a new API key 196 | 4. Add to `.env`: 197 | ``` 198 | FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 199 | ``` 200 | 201 | c. Supabase Database URL: 202 | 1. Use the connection string from your Supabase setup (Step 5) 203 | 2. Add to `.env`: 204 | ``` 205 | POSTGRES_URL= 206 | ``` 207 | 208 | d. OpenAI API Key: 209 | 1. Sign up at [OpenAI](https://platform.openai.com) 210 | 2. Go to API Keys section 211 | 3. Create a new API key 212 | 4. Add to `.env`: 213 | ``` 214 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 215 | ``` 216 | 217 | 8. Configure paper category filtering: 218 | ```bash 219 | cp category_prompt.example.py category_prompt.py 220 | ``` 221 | Then edit `category_prompt.py` to define your `DESIRED_CATEGORY`. This string defines what papers are considered relevant and will trigger notifications. 222 | 223 | The default configuration is set up for "AI Agents" papers, but you can modify it for your needs. 
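
For reference, a minimal sketch of what `category_prompt.py` might contain is shown below. Only the `DESIRED_CATEGORY` name comes from the setup steps above; the category wording itself is an illustration, not the repository's actual default prompt, so adapt it to your own research interests.

```python
# category_prompt.py -- illustrative sketch, not the repository's default text.
# DESIRED_CATEGORY is the name referenced in the setup steps above; the
# description below is an example only and should be edited to taste.

DESIRED_CATEGORY = """
AI Agents: papers about autonomous or semi-autonomous LLM-based agents,
including planning, tool use, multi-agent collaboration, and agent
benchmarks or evaluation. Papers that only mention agents in passing
should not be treated as a match.
"""
```
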
224 | 225 | The semantic filter uses this definition to determine: 226 | - Which papers trigger Discord notifications 227 | - Classification confidence threshold (default: 0.8) 228 | - Categorization criteria for the LLM-based filter 229 | 230 | Your final `.env` file should look like: 231 | ``` 232 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/1234567890/abcdef... 233 | FIRECRAWL_API_KEY=fc-f6ff27d623e548f390bdc0b9debefe59 234 | POSTGRES_URL=postgresql://postgres:mypassword123@db.abcdefghijklm.supabase.co:5432/postgres?sslmode=require 235 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 236 | ``` 237 | 238 | TODO: Create a `.env.example` file with placeholder values after testing is complete, to serve as a template for new users. 239 | TODO: Add to readme how to use the main function firecrawl_crawl_extract.py with arguments to set 240 | the date for the papers to extract. 241 | 242 | ## Database Schema 243 | 244 | The application uses a PostgreSQL database with the following schema: 245 | 246 | ### Papers Table 247 | 248 | | Column | Type | Description | 249 | |------------------------|-----------|-------------------------------------------------------| 250 | | url | String | Primary key - unique paper URL | 251 | | title | String | Paper title | 252 | | authors | String[] | Array of author names | 253 | | abstract | Text | Paper abstract | 254 | | pdf_url | String | URL to PDF version | 255 | | arxiv_url | String | URL to arXiv page | 256 | | github_url | String | URL to GitHub repository | 257 | | publication_date | DateTime | Original publication date | 258 | | submission_date | DateTime | Date added to HuggingFace | 259 | | upvotes | Integer | Current number of upvotes (default: 0) | 260 | | comments | Integer | Current number of comments (default: 0) | 261 | | last_updated | DateTime | Last time the record was updated | 262 | | notification_sent | Boolean | Whether notification was sent (default: false) | 263 | | extraction_success | Boolean | Whether extraction succeeded (default: true) | 264 | | extraction_error | Text | Error message if extraction failed (nullable) | 265 | | last_extraction_attempt| DateTime | When the last extraction was attempted | 266 | 267 | ### Schema Version Table 268 | 269 | | Column | Type | Description | 270 | |-----------|-----------|--------------------------------------| 271 | | version | Integer | Schema version number | 272 | | applied_at| Timestamp | When this version was applied | 273 | 274 | ## Deployment Options 275 | 276 | ### Local Deployment 277 | Follow the setup instructions above for running the tracker locally. 278 | 279 | ### GitHub Actions Deployment (Recommended) 280 | This project includes GitHub Actions workflow configuration for automated paper tracking. The workflow: 281 | - Runs every 12 hours automatically 282 | - Can be triggered manually via workflow_dispatch 283 | - Sends notifications on both success and failure 284 | 285 | To set up with GitHub Actions: 286 | 287 | 1. Fork/clone this repository to your GitHub account 288 | 289 | 2. 
Set up GitHub Secrets: 290 | Go to your repository's Settings > Secrets and Variables > Actions and add the following secrets: 291 | 292 | ``` 293 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 294 | FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 295 | POSTGRES_URL=postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres?sslmode=require 296 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 297 | ``` 298 | 299 | Note: Make sure your Supabase database URL uses port 6543 for GitHub Actions compatibility. 300 | 301 | 3. Configure your tracking preferences: 302 | - Fork the repository 303 | - Edit `category_prompt.py` to define your paper interests 304 | - Commit the changes to your repository 305 | 306 | 4. Enable GitHub Actions: 307 | - Go to your repository's Actions tab 308 | - Enable workflows if they're not already enabled 309 | - The `Paper Tracker` workflow will now run automatically every 12 hours 310 | 311 | 5. Monitor the workflow: 312 | - Check the Actions tab for run history and logs 313 | - Failed runs will send notifications to your configured Discord channel 314 | - Successful runs will send notifications only for relevant papers 315 | 316 | The workflow is defined in `.github/workflows/paper-tracker.yml` and includes: 317 | - Python 3.10 setup 318 | - Poetry dependency management 319 | - Database connection testing 320 | - Automatic error notifications via Discord 321 | 322 | You can also trigger the workflow manually from the Actions tab using the "Run workflow" button. 323 | 324 | ## TODOs 325 | 326 | [ ] Update the GitHub Actions workflow to trigger based on actual changes to the 327 | Hugging Face Daily Papers page instead of using a scheduled cron job. This can be accomplished by: 328 | 1. Creating a GitHub Action that checks for changes to the page content/hash 329 | 2. Only triggering the main paper tracking workflow when changes are detected 330 | 3. This will ensure: 331 | - More timely updates as papers are published 332 | - More efficient use of API credits by avoiding unnecessary checks 333 | - Reduced latency between paper publication and notification 334 | [ ] Implement Hugging Face Daily Papers API to reduce API costs in calling scraping, and crawling models. 335 | - API: https://huggingface.co/api/daily_papers 336 | - Docs: https://huggingface.co/docs/hub/en/api#paper-pages-api 337 | 338 | ### X (Twitter) API Setup 339 | 340 | The system uses X's OAuth 2.0 for posting updates. This requires a one-time local authorization: 341 | 342 | 1. Configure X API credentials in `.env`: 343 | ``` 344 | X_OAUTH2_CLIENT_ID=your_client_id 345 | X_OAUTH2_CLIENT_SECRET=your_client_secret 346 | ``` 347 | 348 | 2. Run the script locally ONCE to authorize: 349 | ```bash 350 | python x_post.py 351 | ``` 352 | This will: 353 | - Open a browser window 354 | - Prompt for X account authorization 355 | - Store OAuth tokens in your `.env` file: 356 | ``` 357 | X_ACCESS_TOKEN=... 358 | X_REFRESH_TOKEN=... 359 | X_TOKEN_EXPIRES_IN=... 360 | X_TOKEN_SCOPE=... 361 | ``` 362 | 363 | 3. After authorization: 364 | - The tokens are stored in `.env` 365 | - Copy the updated `.env` to your server 366 | - Subsequent runs will use stored tokens 367 | - No browser interaction needed 368 | - Works in automated environments 369 | 370 | 4. To disable X posting: 371 | - Comment out the token-related variables in `.env`: 372 | ``` 373 | # X_ACCESS_TOKEN=... 374 | # X_REFRESH_TOKEN=... 375 | # X_TOKEN_EXPIRES_IN=... 
376 | # X_TOKEN_SCOPE=... 377 | ``` 378 | - Uncomment to re-enable posting 379 | - Original credentials are preserved for future use 380 | 381 | Note: Keep your `.env` file secure and never commit it to version control. 382 | 383 | ## Local Testing with Historical Dates 384 | 385 | The system supports testing with historical paper data using the `--date` argument: 386 | 387 | ```bash 388 | # Format: YYYY-MM-DD 389 | python hf_white_paper_tracker.py --date 2024-03-15 390 | ``` 391 | 392 | This is particularly useful for: 393 | - Testing the system with known paper data 394 | - Backfilling missing papers from specific dates 395 | - Debugging extraction issues with historical content 396 | - Verifying database updates without waiting for new papers 397 | 398 | You can also provide a full URL using the `--url` argument: 399 | ```bash 400 | python hf_white_paper_tracker.py --url "https://huggingface.co/papers?date=2024-03-15" 401 | ``` 402 | 403 | The system prioritizes arguments in this order: 404 | 1. `--url` (if provided) 405 | 2. `--date` (if provided) 406 | 3. Today's date (default) 407 | 408 | This flexibility makes local development and testing much more efficient, as you don't need to wait for new papers to be published to verify your changes. -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post_v2.py: -------------------------------------------------------------------------------- 1 | """Module for posting research papers to X (Twitter).""" 2 | 3 | import os 4 | import base64 5 | import hashlib 6 | import secrets 7 | import webbrowser 8 | from http.server import HTTPServer, BaseHTTPRequestHandler 9 | from urllib.parse import parse_qs, urlparse 10 | from dotenv import load_dotenv, set_key, find_dotenv 11 | import requests 12 | from typing import Optional 13 | from logging_config import setup_base_logging, log_function_call 14 | import tempfile 15 | from playwright.sync_api import sync_playwright 16 | import mimetypes 17 | import random 18 | from datetime import datetime 19 | from requests_oauthlib import OAuth1 20 | import time 21 | 22 | # Configure logging using the centralized configuration 23 | logger = setup_base_logging( 24 | logger_name="x_poster", 25 | log_file="x_poster.log", 26 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 27 | ) 28 | 29 | load_dotenv() 30 | 31 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 32 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 33 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 34 | X_API_URL = "https://api.twitter.com/2/tweets" 35 | MEDIA_UPLOAD_URL = "https://upload.twitter.com/1.1/media/upload.json" 36 | POST_UPDATE_URL = "https://api.twitter.com/1.1/statuses/update.json" # v1.1 endpoint 37 | PDF_SCREENSHOT_SIZE = 1080 # 1:1 ratio 38 | PDF_LOAD_TIMEOUT = 30000 # 30 seconds 39 | 40 | @log_function_call 41 | def generate_pkce_pair(): 42 | """Generate PKCE code verifier and challenge""" 43 | code_verifier = secrets.token_urlsafe(64) 44 | code_challenge = base64.urlsafe_b64encode( 45 | hashlib.sha256(code_verifier.encode()).digest() 46 | ).rstrip(b'=').decode() 47 | 48 | logger.debug(f"Generated PKCE - Verifier: {len(code_verifier)} chars, Challenge: {len(code_challenge)} chars") 49 | return code_verifier, code_challenge 50 | 51 | class CallbackHandler(BaseHTTPRequestHandler): 52 | """Handle OAuth callback""" 53 | code = None 54 | 55 | def log_message(self, format, *args): 56 | """Override to use our logger""" 57 | 
logger.info(f"OAuth Callback: {format%args}") 58 | 59 | def do_GET(self): 60 | """Process callback GET request""" 61 | logger.info(f"Received callback request: {self.path}") 62 | 63 | query = parse_qs(urlparse(self.path).query) 64 | CallbackHandler.code = query.get('code', [None])[0] 65 | 66 | if CallbackHandler.code: 67 | logger.info("Successfully received authorization code") 68 | else: 69 | logger.error(f"No authorization code in callback. Query params: {query}") 70 | 71 | # Log any error parameters 72 | if 'error' in query: 73 | logger.error(f"Error in callback: {query['error']}") 74 | if 'error_description' in query: 75 | logger.error(f"Error description: {query['error_description']}") 76 | 77 | self.send_response(200) 78 | self.send_header('Content-type', 'text/html') 79 | self.end_headers() 80 | self.wfile.write(b"Authorization successful! You can close this window.") 81 | 82 | @log_function_call 83 | def load_stored_tokens(): 84 | """Load stored OAuth 2.0 tokens from .env""" 85 | dotenv_path = find_dotenv() 86 | load_dotenv(dotenv_path) 87 | 88 | # Use OAuth 2.0 specific variables 89 | access_token = os.getenv('X_OAUTH2_ACCESS_TOKEN') 90 | refresh_token = os.getenv('X_OAUTH2_REFRESH_TOKEN') 91 | 92 | if access_token and refresh_token: 93 | logger.debug("Successfully loaded stored OAuth 2.0 tokens") 94 | return { 95 | 'access_token': access_token, 96 | 'refresh_token': refresh_token, 97 | 'expires_in': os.getenv('X_OAUTH2_TOKEN_EXPIRES_IN'), 98 | 'scope': os.getenv('X_OAUTH2_TOKEN_SCOPE') 99 | } 100 | logger.warning("No stored OAuth 2.0 tokens found") 101 | return None 102 | 103 | @log_function_call 104 | def save_tokens(tokens): 105 | """Save OAuth 2.0 tokens to .env""" 106 | dotenv_path = find_dotenv() 107 | 108 | try: 109 | # Use different variable names for OAuth 2.0 110 | set_key(dotenv_path, 'X_OAUTH2_ACCESS_TOKEN', tokens['access_token']) 111 | set_key(dotenv_path, 'X_OAUTH2_REFRESH_TOKEN', tokens['refresh_token']) 112 | set_key(dotenv_path, 'X_OAUTH2_TOKEN_EXPIRES_IN', str(tokens['expires_in'])) 113 | set_key(dotenv_path, 'X_OAUTH2_TOKEN_SCOPE', tokens['scope']) 114 | load_dotenv(dotenv_path) 115 | logger.info("Successfully saved OAuth 2.0 tokens to .env") 116 | except Exception as e: 117 | logger.error(f"Failed to save tokens: {str(e)}") 118 | raise 119 | 120 | def refresh_access_token(refresh_token): 121 | """Get new access token using refresh token""" 122 | auth = ( 123 | os.getenv('X_OAUTH2_CLIENT_ID'), 124 | os.getenv('X_OAUTH2_CLIENT_SECRET') 125 | ) 126 | 127 | data = { 128 | 'refresh_token': refresh_token, 129 | 'grant_type': 'refresh_token' 130 | } 131 | 132 | response = requests.post(TOKEN_URL, auth=auth, data=data) 133 | return response.json() 134 | 135 | def get_oauth2_token(): 136 | """Get OAuth 2.0 token, using stored tokens if available""" 137 | # Try to load stored tokens 138 | tokens = load_stored_tokens() 139 | 140 | if tokens: 141 | logger.debug(f"Found stored tokens with keys: {list(tokens.keys())}") 142 | if 'refresh_token' in tokens: 143 | try: 144 | logger.info("Attempting to refresh token...") 145 | new_tokens = refresh_access_token(tokens['refresh_token']) 146 | logger.debug(f"Refresh response keys: {list(new_tokens.keys())}") 147 | save_tokens(new_tokens) 148 | return new_tokens['access_token'] 149 | except Exception as e: 150 | logger.error(f"Token refresh failed: {e}") 151 | else: 152 | logger.error("Stored tokens missing refresh_token") 153 | 154 | # If no stored tokens or refresh failed, do full authorization 155 | logger.info("Starting OAuth 
2.0 PKCE flow") 156 | code_verifier, code_challenge = generate_pkce_pair() 157 | 158 | auth_params = { 159 | 'response_type': 'code', 160 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 161 | 'redirect_uri': CALLBACK_URL, 162 | 'scope': 'tweet.write tweet.read users.read offline.access', 163 | 'code_challenge': code_challenge, 164 | 'code_challenge_method': 'S256', 165 | 'state': secrets.token_urlsafe(32) 166 | } 167 | 168 | logger.info("Starting local callback server") 169 | server = HTTPServer(('127.0.0.1', 8000), CallbackHandler) 170 | 171 | auth_url = f"{AUTH_URL}?{'&'.join(f'{k}={v}' for k,v in auth_params.items())}" 172 | logger.info(f"Opening authorization URL: {auth_url}") 173 | webbrowser.open(auth_url) 174 | 175 | logger.info("Waiting for callback...") 176 | server.handle_request() 177 | 178 | if not CallbackHandler.code: 179 | raise Exception("Failed to get authorization code") 180 | 181 | logger.info("Exchanging code for token") 182 | token_data = { 183 | 'code': CallbackHandler.code, 184 | 'grant_type': 'authorization_code', 185 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 186 | 'redirect_uri': CALLBACK_URL, 187 | 'code_verifier': code_verifier 188 | } 189 | 190 | auth = ( 191 | os.getenv('X_OAUTH2_CLIENT_ID'), 192 | os.getenv('X_OAUTH2_CLIENT_SECRET') 193 | ) 194 | 195 | response = requests.post(TOKEN_URL, auth=auth, data=token_data) 196 | if response.status_code != 200: 197 | raise Exception(f"Token exchange failed: {response.text}") 198 | 199 | token_json = response.json() 200 | logger.debug(f"Received token response with keys: {list(token_json.keys())}") 201 | save_tokens(token_json) 202 | return token_json['access_token'] 203 | 204 | def format_post( 205 | paper_title: str, 206 | authors: list, 207 | url: str, 208 | pdf_url: Optional[str] = None, 209 | arxiv_url: Optional[str] = None, 210 | github_url: Optional[str] = None 211 | ) -> str: 212 | """Format paper details into X post text""" 213 | # Start with title and truncate if needed 214 | post = f"📚 {paper_title[:100]}{'...' if len(paper_title) > 100 else ''}\n\n" 215 | 216 | # Add authors (limited to first 2 if many) 217 | if len(authors) > 2: 218 | authors_text = f"by {', '.join(authors[:2])} et al." 
219 | else: 220 | authors_text = f"by {', '.join(authors)}" 221 | post += f"{authors_text}\n\n" 222 | 223 | # Add links 224 | post += f"🔗 {url}" 225 | if pdf_url: 226 | post += f"\n📄 {pdf_url}" 227 | if arxiv_url: 228 | post += f"\n📝 {arxiv_url}" 229 | if github_url: 230 | post += f"\n💻 {github_url}" 231 | 232 | return post 233 | 234 | @log_function_call 235 | def capture_pdf_screenshot(pdf_url: str) -> Optional[bytes]: 236 | """Capture first page screenshot of PDF using Playwright.""" 237 | logger.info(f"Attempting to capture screenshot of PDF: {pdf_url}") 238 | 239 | try: 240 | with sync_playwright() as p: 241 | logger.debug("Launching browser...") 242 | browser = p.chromium.launch(headless=False) # Run in non-headless mode for debugging 243 | 244 | logger.debug("Creating new page with viewport size: %dx%d", 245 | PDF_SCREENSHOT_SIZE, PDF_SCREENSHOT_SIZE) 246 | page = browser.new_page( 247 | viewport={'width': PDF_SCREENSHOT_SIZE, 'height': PDF_SCREENSHOT_SIZE} 248 | ) 249 | page.set_default_timeout(PDF_LOAD_TIMEOUT) 250 | 251 | logger.debug("Loading PDF...") 252 | page.goto(pdf_url, wait_until='networkidle') # Wait for network to be idle 253 | logger.info("✅ PDF loaded successfully") 254 | 255 | # Add small delay to ensure PDF is rendered 256 | logger.debug("Waiting for PDF to render...") 257 | page.wait_for_timeout(2000) # 2 second delay 258 | 259 | logger.debug("Taking screenshot...") 260 | screenshot = page.screenshot() 261 | 262 | # Validate screenshot data 263 | if screenshot and len(screenshot) > 0: 264 | logger.info(f"✅ Screenshot captured successfully! Size: {len(screenshot)} bytes") 265 | else: 266 | logger.error("❌ Screenshot data is empty!") 267 | return None 268 | 269 | browser.close() 270 | logger.info("Browser closed successfully") 271 | return screenshot 272 | except Exception as e: 273 | logger.error(f"❌ Failed to capture PDF screenshot: {str(e)}") 274 | logger.error("Exception details:", exc_info=True) 275 | return None 276 | 277 | @log_function_call 278 | def check_media_status(media_id: str, auth: OAuth1) -> bool: 279 | """Check if media has finished processing.""" 280 | status_url = f"https://upload.twitter.com/1.1/media/upload.json?command=STATUS&media_id={media_id}" 281 | 282 | try: 283 | response = requests.get(status_url, auth=auth) 284 | if response.status_code == 200: 285 | processing_info = response.json().get('processing_info', {}) 286 | state = processing_info.get('state') 287 | 288 | if state == 'succeeded': 289 | return True 290 | elif state == 'pending': 291 | time.sleep(3) # Wait before checking again 292 | return check_media_status(media_id, auth) 293 | else: 294 | logger.error(f"Media processing failed: {processing_info}") 295 | return False 296 | except Exception as e: 297 | logger.error(f"Error checking media status: {str(e)}") 298 | return False 299 | 300 | @log_function_call 301 | def upload_media(image_data: bytes) -> Optional[str]: 302 | """Upload media using v1.1 API with OAuth 1.0a.""" 303 | if not image_data: 304 | logger.error("❌ No image data provided for upload") 305 | return None 306 | 307 | # Create OAuth 1.0a auth 308 | auth = OAuth1( 309 | client_key=os.getenv('X_API_KEY'), 310 | client_secret=os.getenv('X_API_SECRET'), 311 | resource_owner_key=os.getenv('X_ACCESS_TOKEN'), 312 | resource_owner_secret=os.getenv('X_ACCESS_TOKEN_SECRET') 313 | ) 314 | 315 | try: 316 | logger.info("📤 Uploading media using v1.1 API...") 317 | files = {'media': image_data} 318 | response = requests.post(MEDIA_UPLOAD_URL, auth=auth, files=files) 319 | 320 | if 
response.status_code != 200: 321 | logger.error(f"❌ Media upload failed: {response.status_code}") 322 | logger.error(f"Response: {response.text}") 323 | return None 324 | 325 | media_id = response.json()['media_id_string'] 326 | logger.info(f"✅ Media upload successful! (ID: {media_id})") 327 | return media_id 328 | 329 | except Exception as e: 330 | logger.error(f"❌ Failed to upload media: {str(e)}") 331 | logger.error("Exception details:", exc_info=True) 332 | return None 333 | 334 | @log_function_call 335 | def wait_for_rate_limit(response) -> bool: 336 | """Handle rate limit waiting. Returns True if waited, False if no wait needed.""" 337 | try: 338 | # Only proceed if we have rate limit headers 339 | if 'x-rate-limit-remaining' not in response.headers: 340 | return False 341 | 342 | limit = int(response.headers.get('x-rate-limit-limit', 0)) 343 | remaining = int(response.headers.get('x-rate-limit-remaining', 0)) 344 | reset_time = int(response.headers.get('x-rate-limit-reset', 0)) 345 | 346 | logger.info("Rate Limit Status:") 347 | logger.info(f"- Limit: {limit} requests") 348 | logger.info(f"- Remaining: {remaining} requests") 349 | 350 | # Only wait if we're actually rate limited 351 | if response.status_code == 429: 352 | current_time = time.time() 353 | wait_seconds = max(reset_time - current_time, 0) 354 | 355 | reset_datetime = datetime.fromtimestamp(reset_time) 356 | logger.info(f"Rate limit reset time: {reset_datetime}") 357 | 358 | if wait_seconds > 0: 359 | logger.info(f"Waiting {wait_seconds:.0f} seconds...") 360 | time.sleep(wait_seconds) 361 | return True 362 | 363 | return False 364 | 365 | except Exception as e: 366 | logger.error(f"Error in rate limit handling: {str(e)}") 367 | return False 368 | 369 | @log_function_call 370 | def post_paper(paper_title: str, authors: list, url: str, pdf_url: Optional[str] = None, 371 | arxiv_url: Optional[str] = None, github_url: Optional[str] = None, max_retries: int = 3): 372 | """Post paper using v2 API.""" 373 | logger.info(f"🔄 Starting post process for: {paper_title}") 374 | 375 | # Step 1: Upload media using v1.1 API if PDF URL provided 376 | media_id = None 377 | if pdf_url: 378 | logger.info("📸 Capturing PDF screenshot...") 379 | screenshot = capture_pdf_screenshot(pdf_url) 380 | if screenshot: 381 | media_id = upload_media(screenshot) # Uses v1.1 API with OAuth 1.0a 382 | 383 | # Step 2: Get OAuth 2.0 token for v2 API post 384 | token = get_oauth2_token() 385 | if not token: 386 | logger.error("❌ Failed to get OAuth 2.0 token") 387 | return None 388 | 389 | # Format post text 390 | post_text = format_post(paper_title, authors, url, pdf_url, arxiv_url, github_url) 391 | 392 | # Prepare v2 API payload 393 | payload = { 394 | "text": post_text 395 | } 396 | 397 | # Add media if available 398 | if media_id: 399 | payload["media"] = { 400 | "media_ids": [str(media_id)] 401 | } 402 | 403 | headers = { 404 | "Authorization": f"Bearer {token}", 405 | "Content-Type": "application/json" 406 | } 407 | 408 | # Post using v2 API 409 | retry_count = 0 410 | while retry_count < max_retries: 411 | try: 412 | logger.info("📝 Creating post using v2 API...") 413 | response = requests.post(X_API_URL, json=payload, headers=headers) 414 | 415 | logger.debug(f"Response status: {response.status_code}") 416 | logger.debug(f"Response headers: {dict(response.headers)}") 417 | logger.debug(f"Response body: {response.text}") # Log full response 418 | 419 | # Log rate limit info regardless of status 420 | for header, value in response.headers.items(): 421 | 
if 'rate' in header.lower(): 422 | logger.info(f"Rate limit info - {header}: {value}") 423 | 424 | if response.status_code == 429: # True rate limit 425 | if wait_for_rate_limit(response): 426 | retry_count += 1 427 | continue 428 | else: 429 | logger.error("❌ True rate limit hit") 430 | return {"error": "rate_limit", "response": response.text} 431 | 432 | elif response.status_code == 201: # Success 433 | logger.info("✅ Post successful!") 434 | return response.json() 435 | 436 | else: # Other errors 437 | error_data = response.json() 438 | logger.error(f"❌ Post failed with status {response.status_code}") 439 | logger.error(f"Error details: {error_data}") 440 | return {"error": "api_error", "response": error_data} 441 | 442 | except Exception as e: 443 | logger.error(f"❌ Error posting: {str(e)}") 444 | logger.error("Exception details:", exc_info=True) 445 | break 446 | 447 | retry_count += 1 448 | 449 | return None 450 | 451 | if __name__ == "__main__": 452 | # Random paper title components 453 | adjectives = ["Novel", "Advanced", "Innovative", "Comprehensive", "Efficient"] 454 | topics = ["Machine Learning", "Neural Networks", "Deep Learning", "AI", "Data Science"] 455 | methods = ["Framework", "Approach", "Methodology", "System", "Architecture"] 456 | 457 | # Random author names 458 | first_names = ["James", "Maria", "John", "Sarah", "Michael", "Emma", "David", "Lisa"] 459 | last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis"] 460 | 461 | # Generate random title and authors 462 | random_title = f"{random.choice(adjectives)} {random.choice(topics)} {random.choice(methods)} [{datetime.now().strftime('%H:%M:%S')}]" 463 | random_authors = [ 464 | f"{random.choice(first_names)} {random.choice(last_names)}", 465 | f"{random.choice(first_names)} {random.choice(last_names)}" 466 | ] 467 | 468 | logger.info("Starting X post test") 469 | try: 470 | response = post_paper( 471 | paper_title=random_title, 472 | authors=random_authors, 473 | url="https://huggingface.co/papers/test", 474 | pdf_url="https://arxiv.org/pdf/2401.00935", # Test PDF URL 475 | arxiv_url="https://arxiv.org/abs/test", 476 | github_url="https://github.com/test/repo" 477 | ) 478 | 479 | if response and 'data' in response: 480 | logger.info("✅ Post successful!") 481 | logger.info(f"Tweet ID: {response['data']['id']}") 482 | logger.info(f"Tweet text: {response['data']['text']}") 483 | else: 484 | logger.error("❌ Post failed!") 485 | logger.error(f"Response: {response}") 486 | 487 | except Exception as e: 488 | logger.error("❌ Error during posting:") 489 | logger.error(f"Error details: {str(e)}", exc_info=True) 490 | -------------------------------------------------------------------------------- /claude_researcher_with_map.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "8f015733", 16 | "metadata": { 17 | "id": "8f015733" 18 | }, 19 | "source": [ 20 | "# Firecrawl Web Crawling with OpenAI and Anthropic\n", 21 | "This notebook demonstrates how to use the Firecrawl API along with OpenAI's Anthropic to search for specific information on a website. It takes a user-defined objective and website URL, then attempts to find relevant pages and extract information based on the objective.\n", 22 | "\n", 23 | "### Requirements\n", 24 | "1. 
**Firecrawl API key**: Obtain from your Firecrawl account.\n", 25 | "2. **Anthropic API key**: Obtain from Anthropic if you're leveraging their models.\n", 26 | "3. **AgentOps API key**: If using AgentOps, include its API key.\n", 27 | "\n", 28 | "Set up your API keys as environment variables or directly in the notebook for ease of access.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "TuuO7HAFyuq9", 35 | "metadata": { 36 | "id": "TuuO7HAFyuq9", 37 | "colab": { 38 | "base_uri": "https://localhost:8080/" 39 | }, 40 | "outputId": "f025d6d5-8b78-4e21-a464-da21217fa515" 41 | }, 42 | "outputs": [ 43 | { 44 | "output_type": "stream", 45 | "name": "stdout", 46 | "text": [ 47 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/946.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m450.6/946.0 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m942.1/946.0 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m946.0/946.0 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 48 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/50.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 49 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m288.2/288.2 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 51 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 52 | "\u001b[?25h" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "%pip install -q firecrawl-py anthropic agentops" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "id": "8e2be400", 64 | "metadata": { 65 | "id": "8e2be400" 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from getpass import getpass\n", 70 | "from firecrawl import FirecrawlApp\n", 71 | "import os, re, json, anthropic, agentops" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "RJZQ-gYpGOl9", 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "RJZQ-gYpGOl9", 83 | "outputId": "fa8698d2-7126-4a89-f9f8-46c0b79e7d16" 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "FIRECRAWL_API_KEY··········\n", 91 | "ANTHROPIC_API_KEY··········\n" 92 | ] 93 | }, 94 | { 95 | "output_type": "stream", 96 | "name": "stderr", 97 | "text": [ 98 | "DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False\n", 99 | "DEBUG:httpx:load_verify_locations cafile='/usr/local/lib/python3.10/dist-packages/certifi/cacert.pem'\n" 100 | ] 101 | }, 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 
| "AGENTOPS_API_KEY··········\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Initialize the FirecrawlApp, OpenAI client, and AgentOps\n", 112 | "app = FirecrawlApp(api_key=getpass('FIRECRAWL_API_KEY'))\n", 113 | "client = anthropic.Anthropic(api_key=getpass('ANTHROPIC_API_KEY'))\n", 114 | "AGENTOPS_API_KEY = getpass('AGENTOPS_API_KEY')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "c625af64", 120 | "metadata": { 121 | "id": "c625af64" 122 | }, 123 | "source": [ 124 | "### Custom Color-Coded Logging Configuration\n", 125 | "This cell sets up a custom logging configuration with color-coded output for different log levels, enhancing readability for various messages.\n", 126 | "\n", 127 | "The `CustomFormatter` class applies specific colors to log levels (DEBUG, INFO, WARNING, ERROR, CRITICAL) and resets colors after each log message.\n", 128 | "\n", 129 | "A `StreamHandler` is added to the root logger with this custom formatter, displaying messages in the notebook's output stream.\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 17, 135 | "id": "98ec7066", 136 | "metadata": { 137 | "id": "98ec7066" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "import logging\n", 142 | "\n", 143 | "# Set up colored logging\n", 144 | "class CustomFormatter(logging.Formatter):\n", 145 | " COLORS = {\n", 146 | " 'DEBUG': '\\033[94m', # Blue\n", 147 | " 'INFO': '\\033[92m', # Green\n", 148 | " 'WARNING': '\\033[93m', # Yellow\n", 149 | " 'ERROR': '\\033[91m', # Red\n", 150 | " 'CRITICAL': '\\033[95m' # Magenta\n", 151 | " }\n", 152 | " RESET = '\\033[0m'\n", 153 | " FORMAT = \"[%(levelname)s] %(message)s\"\n", 154 | "\n", 155 | " def format(self, record):\n", 156 | " log_color = self.COLORS.get(record.levelname, self.RESET)\n", 157 | " log_fmt = log_color + self.FORMAT + self.RESET\n", 158 | " formatter = logging.Formatter(log_fmt)\n", 159 | " return formatter.format(record)\n", 160 | "\n", 161 | "# Configure the root logger\n", 162 | "logger = logging.getLogger()\n", 163 | "logger.setLevel(logging.INFO)\n", 164 | "\n", 165 | "ch = logging.StreamHandler()\n", 166 | "ch.setLevel(logging.INFO)\n", 167 | "ch.setFormatter(CustomFormatter())\n", 168 | "\n", 169 | "# Add handler if not already added\n", 170 | "if not logger.hasHandlers():\n", 171 | " logger.addHandler(ch)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "e732d54d", 177 | "metadata": { 178 | "id": "e732d54d" 179 | }, 180 | "source": [ 181 | "### Step 1: Finding the Relevant Page\n", 182 | "The function `find_relevant_page_via_map` takes an objective and a website URL. 
It then uses the Anthropic client to generate search parameters for the Firecrawl API to map the website and identify relevant pages based on the objective.\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 18, 188 | "id": "13f88d4a", 189 | "metadata": { 190 | "id": "13f88d4a" 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "# Find the page that most likely contains the objective using Firecrawl's Map\n", 195 | "def find_relevant_page_via_map(objective, url, app, client):\n", 196 | " \"\"\"\n", 197 | " Identifies the page most likely to contain the specified objective using Firecrawl's Map.\n", 198 | "\n", 199 | " Args:\n", 200 | " objective (str): The objective to search for within the website pages.\n", 201 | " url (str): The base URL of the website to be crawled.\n", 202 | " app (object): The application instance for conducting the crawl.\n", 203 | " client (object): The client used to make requests to the pages.\n", 204 | "\n", 205 | " Returns:\n", 206 | " str or None: Returns the URL of the page that most likely contains the objective if found; otherwise, returns None.\n", 207 | " \"\"\"\n", 208 | " try:\n", 209 | " logger.info(f\"{Colors.CYAN}Objective: {objective}{Colors.RESET}\")\n", 210 | " logger.info(f\"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}\")\n", 211 | "\n", 212 | " map_prompt = f\"\"\"\n", 213 | " The map function generates a list of URLs from a website and accepts a search parameter.\n", 214 | " Based on the objective of: {objective}, suggest a 1-2 word search parameter.\n", 215 | " \"\"\"\n", 216 | "\n", 217 | " completion = client.messages.create(\n", 218 | " model='claude-3-5-sonnet-20241022',\n", 219 | " max_tokens=1000,\n", 220 | " temperature=0,\n", 221 | " system=\"Expert web crawler\",\n", 222 | " messages=[{'role': 'user', 'content': map_prompt}]\n", 223 | " )\n", 224 | "\n", 225 | " map_search_parameter = completion.content[0].text\n", 226 | " map_website = app.map_url(url, params={'search': map_search_parameter})\n", 227 | "\n", 228 | " logger.info(f\"{Colors.GREEN}Mapping completed. Links found: {len(map_website['links'])}{Colors.RESET}\")\n", 229 | " return map_website['links']\n", 230 | " except Exception as e:\n", 231 | " logger.info(f\"{Colors.RED}Error: {str(e)}{Colors.RESET}\")\n", 232 | " return None" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "c49f669d", 238 | "metadata": { 239 | "id": "c49f669d" 240 | }, 241 | "source": [ 242 | "### Step 2: Examining Top Pages using Firewcrawl's [Map](https://docs.firecrawl.dev/features/map)\n", 243 | "The function `find_objective_in_top_pages` examines the top pages from the website map, attempting to fulfill the user's objective using scraped content. If the objective is met, it returns the relevant data in JSON format.\n", 244 | "\n", 245 | "**Note:** Firecrawl's Map Response will be an ordered list from the most relevant to the least relevant. 
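Because the list comes back already ranked, the end-to-end flow of this notebook can be sketched in a few lines. The objective and URL here are placeholders, and both helper functions are the ones defined in the surrounding cells:

```python
# Hypothetical inputs -- replace with your own objective and target site.
objective = "Find the contact email for customer support"
url = "https://example.com"

# Map the site and get a ranked list of candidate links (Step 1).
links = find_relevant_page_via_map(objective, url, app, client)

# Scrape the top-ranked pages and try to satisfy the objective (Step 2).
if links:
    extracted = find_objective_in_top_pages(links, objective, app, client)
    print(extracted)  # a JSON-like dict if the objective was met, otherwise None
```

Returning to how that ranked list is used: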
By selecting only the first two elements (`[:2]`), the function focuses on analyzing just the two most relevant pages identified during the mapping stage.\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 19, 251 | "id": "812fd739", 252 | "metadata": { 253 | "id": "812fd739" 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "def find_objective_in_top_pages(map_website, objective, app, client):\n", 258 | " \"\"\"\n", 259 | " Scrapes the top 2 pages returned by the map step to check if the specified objective is met.\n", 260 | "\n", 261 | " Args:\n", 262 | " map_website (list): The list of URLs returned by the map step.\n", 263 | " objective (str): The objective to look for within the pages.\n", 264 | " app (object): The FirecrawlApp instance used for batch scraping.\n", 265 | " client (object): The Anthropic client used to analyze the scraped content.\n", 266 | "\n", 267 | " Returns:\n", 268 | " dict or None: Returns a JSON object if the objective is found within the top 2 pages; otherwise, returns None.\n", 269 | " \"\"\"\n", 270 | " try:\n", 271 | " # Get top 2 links from the map result\n", 272 | " top_links = map_website[:2]\n", 273 | " logger.info(f\"{Colors.CYAN}Analyzing the {len(top_links)} top links: {top_links}{Colors.RESET}\")\n", 274 | "\n", 275 | " batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})\n", 276 | " logger.info(f\"{Colors.GREEN}Batch scraping completed.{Colors.RESET}\")\n", 277 | "\n", 278 | " for scrape_result in batch_scrape_result['data']:\n", 279 | " check_prompt = f\"\"\"\n", 280 | " Given the following scraped content and objective, determine if the objective is met.\n", 281 | " If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.\n", 282 | " If the objective is not met with confidence, respond with 'Objective not met'.\n", 283 | "\n", 284 | " Objective: {objective}\n", 285 | " Scraped content: {scrape_result['markdown']}\n", 286 | "\n", 287 | " Remember:\n", 288 | " 1. Only return JSON if you are confident the objective is fully met.\n", 289 | " 2. Keep the JSON structure as simple and flat as possible.\n", 290 | " 3. Do not include any explanations or markdown formatting in your response.\n", 291 | " \"\"\"\n", 292 | "\n", 293 | " completion = client.messages.create(\n", 294 | " model=\"claude-3-5-sonnet-20241022\",\n", 295 | " max_tokens=1000,\n", 296 | " temperature=0,\n", 297 | " system=\"You are an expert web crawler. Respond with the relevant information in JSON format.\",\n", 298 | " messages=[\n", 299 | " {\n", 300 | " \"role\": \"user\",\n", 301 | " \"content\": [\n", 302 | " {\n", 303 | " \"type\": \"text\",\n", 304 | " \"text\": check_prompt\n", 305 | " }\n", 306 | " ]\n", 307 | " }\n", 308 | " ]\n", 309 | " )\n", 310 | "\n", 311 | " result = completion.content[0].text\n", 312 | " if result and result != 'Objective not met':\n", 313 | " try:\n", 314 | " return json.loads(result)\n", 315 | " except json.JSONDecodeError as e:\n", 316 | " logger.info(f\"{Colors.RED}JSON parsing error: {e}. 
Raw result: {result}{Colors.RESET}\")\n", 317 | " continue # Skip to the next result if parsing fails\n", 318 | "\n", 319 | " logger.info(f\"{Colors.RED}Objective not met in examined content.{Colors.RESET}\")\n", 320 | " return None\n", 321 | " except Exception as e:\n", 322 | " logger.info(f\"{Colors.RED}Error during analysis: {str(e)}{Colors.RESET}\")\n", 323 | " return None" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "l3WzoekQEFCq", 329 | "metadata": { 330 | "id": "l3WzoekQEFCq" 331 | }, 332 | "source": [ 333 | "### Step 3: Find and Extract Information\n", 334 | "\n", 335 | "This function aims to find and extract information related to a given `objective` from the top-ranked pages of a website.\n", 336 | "\n", 337 | "**Functionality:**\n", 338 | "\n", 339 | "1. **Selects Top Links:** It selects the top two URLs from the `map_website` list, assuming they are the most relevant to the objective.\n", 340 | "2. **Scrapes Content:** It uses the `app.batch_scrape_urls` function to scrape content from these selected URLs in Markdown format.\n", 341 | "3. **Analyzes Content:** For each scraped page, it constructs a prompt for the Anthropic Claude model. This prompt asks the model to determine if the scraped content fulfills the `objective`. If it does, the model is asked to extract the relevant information and format it as JSON.\n", 342 | "4. **Extracts JSON:** The function uses a regular expression to identify JSON-like blocks within the Anthropic model's response and parses the first match with `json.loads`, moving on to the next page if parsing fails." 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "Kc31wkA3EEtl", 348 | "metadata": { 349 | "id": "Kc31wkA3EEtl" 350 | }, 351 | "source": [] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 20, 356 | "id": "02DvK01wELut", 357 | "metadata": { 358 | "id": "02DvK01wELut" 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "def find_objective_in_top_pages(map_website, objective, app, client):\n", 363 | " \"\"\"\n", 364 | " Scrapes the top 2 pages returned by the map step to determine if the specified objective is present.\n", 365 | "\n", 366 | " Args:\n", 367 | " map_website (list): The list of mapped URLs that guides the scraping.\n", 368 | " objective (str): The objective or target content to search for on the pages.\n", 369 | " app (object): The FirecrawlApp instance used for executing the scraping process.\n", 370 | " client (object): The Anthropic client used to analyze the scraped content.\n", 371 | "\n", 372 | " Returns:\n", 373 | " dict or None: Returns a JSON object containing the found objective details if located on one of the top 2 pages; otherwise, returns None.\n", 374 | " \"\"\"\n", 375 | " try:\n", 376 | " top_links = map_website[:2]\n", 377 | " logger.info(f\"{Colors.CYAN}Analyzing top links: {top_links}{Colors.RESET}\")\n", 378 | "\n", 379 | " batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})\n", 380 | " logger.info(f\"{Colors.GREEN}Batch scraping completed.{Colors.RESET}\")\n", 381 | "\n", 382 | " # Regex pattern to match JSON-like blocks in the response\n", 383 | " json_pattern = r\"\\{(?:[^{}]|(?:\\{[^{}]*\\}))*\\}\"\n", 384 | "\n", 385 | " for scrape_result in batch_scrape_result['data']:\n", 386 | " check_prompt = f\"\"\"\n", 387 | " Given scraped content and objective, determine if the objective is met.\n", 388 | " Extract relevant information in simple JSON if met.\n", 389 | " Objective: {objective}\n", 390 | " Scraped content: {scrape_result['markdown']}\n", 391 | " \"\"\"\n", 392 | "\n", 393 | " completion = client.messages.create(\n", 394 
| " model='claude-3-5-sonnet-20241022',\n", 395 | " max_tokens=1000,\n", 396 | " temperature=0,\n", 397 | " system=\"Expert web crawler\",\n", 398 | " messages=[{'role': 'user', 'content': check_prompt}]\n", 399 | " )\n", 400 | "\n", 401 | " result = completion.content[0].text\n", 402 | " # Search for JSON-like block in the result text\n", 403 | " json_match = re.search(json_pattern, result, re.DOTALL)\n", 404 | " if json_match:\n", 405 | " try:\n", 406 | " return json.loads(json_match.group(0))\n", 407 | " except json.JSONDecodeError as e:\n", 408 | " logger.info(f\"{Colors.RED}JSON parsing error: {e}. Raw result: {json_match.group(0)}{Colors.RESET}\")\n", 409 | " continue # Skip to the next result if parsing fails\n", 410 | " else:\n", 411 | " logger.info(f\"{Colors.YELLOW}No JSON found in the response. Raw result: {result}{Colors.RESET}\")\n", 412 | "\n", 413 | " logger.info(f\"{Colors.RED}Objective not met in examined content.{Colors.RESET}\")\n", 414 | " return None\n", 415 | " except Exception as e:\n", 416 | " logger.info(f\"{Colors.RED}Error during analysis: {str(e)}{Colors.RESET}\")\n", 417 | " return None" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "id": "9fc4cef6", 423 | "metadata": { 424 | "id": "9fc4cef6" 425 | }, 426 | "source": [ 427 | "### Step 4: Executing the Main Function\n", 428 | "The main function prompts for user input (website URL and objective), calls the `find_relevant_page_via_map` and `find_objective_in_top_pages` functions, and displays results accordingly.\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 21, 434 | "id": "e3721623", 435 | "metadata": { 436 | "id": "e3721623" 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "# Main function to execute the process\n", 441 | "def main():\n", 442 | " url = input(f\"{Colors.BLUE}Enter website URL:{Colors.RESET}\") or \"https://www.firecrawl.dev/\"\n", 443 | " objective = input(f\"{Colors.BLUE}Enter objective:{Colors.RESET}\") or \"find pricing plans\"\n", 444 | "\n", 445 | " map_website = find_relevant_page_via_map(objective, url, app, client)\n", 446 | "\n", 447 | " if map_website:\n", 448 | " result = find_objective_in_top_pages(map_website, objective, app, client)\n", 449 | " if result:\n", 450 | " logger.info(f\"{Colors.GREEN}Objective met. 
Extracted info:{Colors.RESET}\")\n", 451 | " logger.info(f\"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}\")\n", 452 | " else:\n", 453 | " logger.info(f\"{Colors.RED}Objective not fulfilled with available content.{Colors.RESET}\")\n", 454 | " else:\n", 455 | " logger.info(f\"{Colors.RED}No relevant pages identified.{Colors.RESET}\")" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 22, 461 | "id": "0cr6H3nlBSMG", 462 | "metadata": { 463 | "colab": { 464 | "base_uri": "https://localhost:8080/" 465 | }, 466 | "id": "0cr6H3nlBSMG", 467 | "outputId": "452e3ae2-fd24-44a9-d0e1-8086f1204ef2" 468 | }, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "\u001b[94mEnter website URL:\u001b[0mhttps://www.firecrawl.dev/\n", 475 | "\u001b[94mEnter objective:\u001b[0myes or no: is firecrawl backed by y combinator?\n" 476 | ] 477 | }, 478 | { 479 | "output_type": "stream", 480 | "name": "stderr", 481 | "text": [ 482 | "INFO:root:\u001b[96mObjective: yes or no: is firecrawl backed by y combinator?\u001b[0m\n", 483 | "INFO:root:\u001b[96mInitiating search on the website: https://www.firecrawl.dev/\u001b[0m\n", 484 | "INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages \"HTTP/1.1 200 OK\"\n", 485 | "INFO:root:\u001b[92mMapping completed. Links found: 42\u001b[0m\n", 486 | "INFO:root:\u001b[96mAnalyzing top links: ['https://www.firecrawl.dev/blog/your-ip-has-been-temporarily-blocked-or-banned', 'https://www.firecrawl.dev/blog/how-to-quickly-install-beautifulsoup-with-python']\u001b[0m\n", 487 | "INFO:root:\u001b[92mBatch scraping completed.\u001b[0m\n", 488 | "INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages \"HTTP/1.1 200 OK\"\n", 489 | "INFO:root:\u001b[92mObjective met. Extracted info:\u001b[0m\n", 490 | "INFO:root:\u001b[95m{\n", 491 | " \"can_determine\": false,\n", 492 | " \"reason\": \"No mention of Y Combinator backing in the scraped content\"\n", 493 | "}\u001b[0m\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "main()" 499 | ] 500 | } 501 | ], 502 | "metadata": { 503 | "colab": { 504 | "provenance": [], 505 | "include_colab_link": true 506 | }, 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "name": "python" 513 | } 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 5 517 | } --------------------------------------------------------------------------------