├── .dockerignore ├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── agent.py ├── agent_viz.py ├── baml_client ├── __init__.py ├── async_client.py ├── async_request.py ├── config.py ├── globals.py ├── inlinedbaml.py ├── parser.py ├── partial_types.py ├── sync_client.py ├── sync_request.py ├── tracing.py ├── type_builder.py └── types.py ├── baml_src ├── clients.baml ├── generators.baml ├── router.baml └── summarize.baml ├── bot.py ├── images ├── image.png └── system_arch.jpg ├── langgraph.json ├── pyproject.toml ├── scripts ├── deploy_cloud_run.sh ├── deploy_server.sh ├── run_docker.sh ├── run_local.sh └── setup_secrets.sh ├── tools ├── __init__.py ├── linkedin_agentql_scraper.py ├── pdf_handler.py ├── search.py ├── twitter_api_tool.py └── youtube_agentql_scraper.py └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git files 2 | .git 3 | .gitignore 4 | 5 | # Python virtual environment 6 | .venv 7 | 8 | # Python cache 9 | __pycache__/ 10 | *.pyc 11 | *.pyo 12 | *.pyd 13 | 14 | # OS specific files 15 | .DS_Store 16 | 17 | # Secrets 18 | .env 19 | 20 | # We copy pyproject.toml and uv.lock now; requirements.txt is no longer used by the Dockerfile 21 | # uv.lock 22 | # pyproject.toml 23 | requirements.txt 24 | 25 | # Other build artifacts if any 26 | .idea/ 27 | *.egg-info/ 28 | dist/ 29 | build/ 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | .env 12 | __pycache__ 13 | 14 | .langgraph_api -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # # Dockerfile 2 | # --- 1. use Microsoft's pre-built image (has Chromium + all libs) 3 | FROM mcr.microsoft.com/playwright/python:v1.52.0-noble 4 | 5 | WORKDIR /app 6 | 7 | # --- 2. install uv 8 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /usr/local/bin/ 9 | 10 | # --- 3. copy dependency files 11 | COPY pyproject.toml uv.lock ./ 12 | 13 | # --- 4. install dependencies using uv 14 | RUN uv sync --frozen --no-cache 15 | 16 | # --- 5. copy code & launch 17 | COPY . . 18 | ENV PORT=8080 PYTHONUNBUFFERED=1 19 | CMD ["uv", "run", "uvicorn", "bot:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Telegram Link Summarizer Agent 2 | 3 | [Join the Telegram Channel @tech_links for curated tech news and interesting links](https://t.me/tech_links) 4 | 5 | [![GitHub Repo stars](https://img.shields.io/github/stars/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 6 | [![GitHub forks](https://img.shields.io/github/forks/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 7 | [![GitHub License](https://img.shields.io/github/license/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 8 | [![Telegram Channel](https://img.shields.io/badge/Telegram-Join%20Channel-blue?logo=telegram)](https://t.me/tech_links) 9 | [![Ask 
DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kargarisaac/telegram_link_summarizer_agent) 10 | 11 | If you want to get the latest news and interesting links for free, feel free to join the channel. If you find this project useful, giving the repository a star would be appreciated. 12 | 13 | ![Agent Visualization](./images/image.png) 14 | 15 | ![System Diagram](./images/system_arch.jpg) 16 | 17 | 18 | An agentic Telegram bot designed to summarize web links (articles, papers, tweets etc.) sent in a chat. It uses LangGraph to orchestrate multiple tools and language models to determine the link type, extract content, and generate concise summaries. 19 | 20 | ## ✨ Features 21 | 22 | * **Link Summarization:** Extracts content from URLs (webpages, PDFs, Twitter/X, LinkedIn posts) and provides summaries. 23 | * **Robust YouTube Support:** Handles YouTube links using Playwright and AgentQL to extract video title and description. 24 | * **LLM Routing:** Uses a BAML LLM function (`RouteRequest`) to determine the type of link (Webpage, PDF, Twitter, LinkedIn, Unsupported). 25 | * **Web Search/Extraction:** Uses Tavily for standard webpage content extraction. 26 | * **PDF Support:** Can process and summarize PDF documents found at URLs. 27 | * **Twitter/X Support:** Fetches tweet content (including threads) using the `twitterapi.io` service. 28 | * **LinkedIn Support:** Extracts content from LinkedIn post URLs using Playwright and AgentQL. 29 | * **Agentic Workflow:** Leverages LangGraph for a multi-step reasoning process. 30 | * **BAML Integration:** Uses BAML for structured output generation (summaries and routing). 31 | * **Telegram Bot Interface:** Interacts via a simple Telegram bot, replying silently on failure. 
32 | 33 | ## 🛠️ Tech Stack 34 | 35 | * **Routing/Summarization:** BAML (Boundary) + LLM (e.g., Gemini, Deepseek) 36 | * **Orchestration:** LangGraph 37 | * **YouTube Extraction:** `playwright`, `agentql` 38 | * **Twitter/X API:** `twitterapi.io` via `requests` 39 | * **Web Extraction:** Tavily Search SDK 40 | * **LinkedIn Extraction:** `playwright`, `agentql` 41 | * **PDF Extraction:** PyMuPDF (`fitz`) 42 | * **Telegram Bot:** `python-telegram-bot` 43 | * **Web Framework:** FastAPI + Uvicorn 44 | * **Dependencies:** Managed via `pyproject.toml` (using `uv` or `pip`) 45 | 46 | ## 🚀 Setup 47 | 48 | 1. **Clone the repository:** 49 | ```bash 50 | git clone https://github.com/kargarisaac/telegram_link_summarizer_agent.git 51 | cd telegram_link_summarizer_agent 52 | ``` 53 | 54 | 2. **Install Dependencies (using [`uv`](https://github.com/astral-sh/uv) or `pip`)** 55 | * You can use [`uv`](https://github.com/astral-sh/uv) or standard `pip`: 56 | ```bash 57 | # Using uv (recommended) 58 | uv pip install -e . # Install in editable mode 59 | 60 | # Or using pip 61 | pip install -e . # Install in editable mode 62 | ``` 63 | * Install Playwright browsers: 64 | ```bash 65 | playwright install 66 | ``` 67 | 68 | 3. **Set up Environment Variables:** 69 | Create a file named `.env` in the project root directory. 
Add the following environment variables with your actual values: 70 | ```env 71 | # --- Core API Keys --- 72 | # Select *one* LLM provider for BAML functions (or configure multiple providers) 73 | # GEMINI_API_KEY="your_google_gemini_api_key" # For Google LLMs 74 | DEEPSEEK_API_KEY="your_deepseek_api_key" # For Deepseek LLMs 75 | GOOGLE_API_KEY="your_google_cloud_api_key" # e.g., For Google LLMs or other Google Cloud services 76 | 77 | # Tools 78 | TAVILY_API_KEY="your_tavily_api_key" 79 | TWITTER_API_IO_KEY="your_twitterapi.io_api_key" # API Key for twitterapi.io service 80 | AGENTQL_API_KEY="your_agentql_api_key" # API Key for AgentQL 81 | 82 | # --- Telegram Bot Configuration --- 83 | TELEGRAM_BOT_TOKEN="your_telegram_bot_token" 84 | 85 | # --- Webhook Configuration (Needed for deployment or local testing with ngrok) --- 86 | # For ngrok, use the https://.ngrok-free.app URL 87 | # For deployment, this isn't strictly needed in the .env for the *deployed* app, 88 | # but the deployment script will set the webhook based on the Cloud Run URL. 89 | # WEBHOOK_URL="your_webhook_url_or_ngrok_url" 90 | 91 | # Secure your webhook (generate strong random strings for these) 92 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_secret_token" 93 | # Example: /webhook/aBcDeF12345 - must start with a slash! 94 | # WEBHOOK_SECRET_PATH="/your_unique_and_random_webhook_path" 95 | 96 | # --- Polling vs Webhook Mode (for bot.py) --- 97 | # Set USE_POLLING to "true" to run the bot in polling mode (recommended for self-managed servers without HTTPS). 98 | # If USE_POLLING is "true", WEBHOOK_URL and related settings are ignored by bot.py. 99 | # Defaults to webhook mode if not set or "false". 
100 | # USE_POLLING="true" 101 | 102 | # --- Webhook Configuration (Only if NOT using USE_POLLING="true") --- 103 | # For local ngrok testing: 104 | # WEBHOOK_URL="https://your-ngrok-subdomain.ngrok-free.app" 105 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 106 | 107 | # For self-managed server with public IP (HTTP, for testing - Telegram prefers HTTPS for production): 108 | # WEBHOOK_URL="http://YOUR_SERVER_IP:8080" 109 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 110 | 111 | # For self-managed server with domain and HTTPS (Production Webhook): 112 | # WEBHOOK_URL="https://yourbot.yourdomain.com" # Nginx would proxy to http://localhost:8080 113 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 114 | 115 | # For Google Cloud Run (this is typically set by the deploy_cloud_run.sh script, not manually in .env): 116 | # WEBHOOK_URL="your_cloud_run_service_url" 117 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 118 | 119 | # Secure your webhook (generate strong random strings for these) - ALWAYS NEEDED FOR WEBHOOK MODE 120 | # TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_secret_token" 121 | ``` 122 | **Important:** 123 | * Get your `TWITTER_API_IO_KEY` from [twitterapi.io](https://twitterapi.io/). 124 | * Ensure your chosen LLM API Key (`GEMINI_API_KEY` or `DEEPSEEK_API_KEY`) is uncommented and valid. 125 | * Keep your `.env` file secure and do not commit it. The `.gitignore` should exclude `.env`. 126 | 127 | ## ▶️ Usage 128 | 129 | 1. **(Optional) Run the Agent Script Directly (for testing specific URLs):** 130 | * You can test the core agent logic by running `agent.py`. Modify the test cases at the bottom of the script. 
131 | ```bash 132 | python agent.py 133 | ``` 134 | 135 | ## 📊 Agent Visualization 136 | 137 | The `agent_viz.py` script can be used to generate a visualization of the LangGraph agent (like the image at the top). Ensure `graphviz` is installed (`brew install graphviz` or `sudo apt-get install graphviz`). 138 | 139 | ```bash 140 | python agent_viz.py 141 | ``` 142 | This will generate an `agent_graph.png` file. 143 | 144 | ## Local Running (Webhook Mode) 145 | 146 | This runs the FastAPI server using `uvicorn`. This requires `USE_POLLING="false"` and a publicly accessible `WEBHOOK_URL` set in your `.env` file (e.g., using ngrok) for the bot to receive messages from Telegram. 147 | 148 | Make sure you have installed dependencies (`uv pip install -e .`) and configured your `.env` file. 149 | 150 | ```bash 151 | # Make the script executable (only needed once) 152 | chmod +x ./scripts/run_local.sh 153 | 154 | # Run the local server 155 | ./scripts/run_local.sh 156 | ``` 157 | 158 | You can check if the server is running by accessing the health check endpoint: `curl http://localhost:8080/health` 159 | 160 | ## Testing Webhooks Locally with ngrok 161 | 162 | When running your bot locally, Telegram cannot reach your computer directly because `localhost` is not accessible from the public internet. To test real Telegram messages with webhooks during development, you can use [ngrok](https://ngrok.com/) to create a secure tunnel from a public URL to your local machine. 163 | 164 | ### Steps 165 | 166 | 1. **Install ngrok:** 167 | - Download from https://ngrok.com/download or install via your package manager. 168 | - On macOS (Homebrew): 169 | ```bash 170 | brew install ngrok 171 | ``` 172 | - On Linux: 173 | ```bash 174 | sudo snap install ngrok 175 | ``` 176 | - On Windows: Download and extract the executable from the website. 177 | 178 | 2. 
**Start your local server:** 179 | ```bash 180 | # Ensure the script is executable 181 | chmod +x ./scripts/run_local.sh 182 | # Run the local server 183 | ./scripts/run_local.sh 184 | ``` 185 | 186 | 3. **Start ngrok to expose port 8080:** 187 | ```bash 188 | ngrok http 8080 189 | ``` 190 | - You will see output like: 191 | ``` 192 | Forwarding https://abcd-1234.ngrok-free.app -> http://localhost:8080 193 | ``` 194 | - Copy the HTTPS URL provided by ngrok (e.g., `https://abcd-1234.ngrok-free.app`). 195 | 196 | 4. **Update your `.env` file:** 197 | - Set the `WEBHOOK_URL` to the ngrok HTTPS URL: 198 | ```env 199 | WEBHOOK_URL=https://your-ngrok-url.ngrok-free.app 200 | ``` 201 | - Save the file. 202 | 203 | 5. **Restart your local server:** 204 | - Stop the running `./scripts/run_local.sh` process (Ctrl+C) and start it again: 205 | ```bash 206 | ./scripts/run_local.sh 207 | ``` 208 | - On startup, the bot should attempt to register the webhook with Telegram using your public ngrok URL (if `bot.py` is configured to do so based on `WEBHOOK_URL`). 209 | 210 | 6. **Test your bot:** 211 | - Send a message with a link to your Telegram bot as usual. 212 | - Telegram will send the update to your ngrok public URL, which forwards it to your local server. 213 | - You should see logs in your terminal and receive a response from your local bot. 214 | 215 | **Tip:** If you restart ngrok, you will get a new public URL. Update your `.env` and restart the server each time. 216 | 217 | **Security Note:** For production, always use a secret path (`WEBHOOK_SECRET_PATH`) and a secret token (`TELEGRAM_WEBHOOK_SECRET_TOKEN`) for your webhook endpoint. For local ngrok testing, ensure these variables are also set in your `.env` if your `bot.py` requires them even locally. 218 | 219 | ## Docker Testing 220 | 221 | This builds the Docker image and runs the container locally. Ensure your `.env` file is present and configured in the project root. 
222 | 223 | ```bash 224 | # Make the script executable (only needed once) 225 | chmod +x ./scripts/run_docker.sh 226 | 227 | # Build and run the Docker container 228 | ./scripts/run_docker.sh 229 | ``` 230 | 231 | You can check the health endpoint at `http://localhost:8080/health` 232 | 233 | ### Testing Docker Locally with ngrok 234 | 235 | You can also test the Docker container with ngrok to receive real Telegram messages: 236 | 237 | 1. **Run the Docker Container:** 238 | ```bash 239 | # Ensure script is executable 240 | chmod +x ./scripts/run_docker.sh 241 | # Build and run the container (loads .env) 242 | ./scripts/run_docker.sh 243 | ``` 244 | *(Leave this terminal running)* 245 | 246 | 2. **Start ngrok:** In a *new* terminal, run: 247 | ```bash 248 | ngrok http 8080 249 | ``` 250 | Copy the HTTPS URL provided by ngrok. 251 | 252 | 3. **Update `.env`:** Set the `WEBHOOK_URL` variable in your `.env` file to the ngrok HTTPS URL. 253 | 254 | 4. **Restart Docker Container:** Stop the running container (Ctrl+C in the first terminal, or `docker stop summarizer-bot`) and restart it using: 255 | ```bash 256 | ./scripts/run_docker.sh 257 | ``` 258 | This ensures the container picks up the new `WEBHOOK_URL` from the `.env` file. 259 | 260 | 5. **Test:** Send messages to your bot. They should be routed through ngrok to your running Docker container. 261 | 262 | ## Deploying to a Self-Managed Server/VM (Docker) 263 | 264 | This method uses Docker and the provided `scripts/deploy_server.sh` script to deploy the bot to your own virtual machine or dedicated server. This is the recommended approach for self-hosting. 265 | 266 | ### 1. Server Preparation 267 | 268 | SSH into your server and ensure `git` and `docker` are installed. 
269 | 270 | ```bash 271 | # Update system (example for Debian/Ubuntu) 272 | sudo apt update && sudo apt upgrade -y 273 | 274 | # Install Git 275 | sudo apt install -y git 276 | 277 | # Install Docker 278 | sudo apt install -y docker.io 279 | sudo systemctl start docker 280 | sudo systemctl enable docker 281 | 282 | # Optional: Add your user to the docker group to run docker commands without sudo 283 | # sudo usermod -aG docker $USER 284 | # newgrp docker # Or log out and log back in 285 | ``` 286 | 287 | ### 2. Clone Repository 288 | 289 | Clone your repository onto the server: 290 | ```bash 291 | git clone 292 | cd telegram_link_summarizer_agent 293 | ``` 294 | 295 | ### 3. Configure Environment (`.env` file) 296 | 297 | Create a `.env` file in the project root on your server. 298 | 299 | **Option A: Polling Mode (Recommended for Simplicity)** 300 | This is the easiest way to get started on a self-managed server as it doesn't require a public domain, SSL, or complex firewall/proxy setup beyond allowing outbound connections. 301 | 302 | ```env 303 | # In your .env file on the server: 304 | USE_POLLING="true" 305 | 306 | # --- Core API Keys --- 307 | DEEPSEEK_API_KEY="your_deepseek_api_key" 308 | # GEMINI_API_KEY="your_google_gemini_api_key" 309 | TAVILY_API_KEY="your_tavily_api_key" 310 | TWITTER_API_IO_KEY="your_twitterapi.io_api_key" 311 | AGENTQL_API_KEY="your_agentql_api_key" 312 | 313 | # --- Telegram Bot Configuration --- 314 | TELEGRAM_BOT_TOKEN="your_telegram_bot_token" 315 | 316 | # --- Webhook related variables can be omitted or commented out when USE_POLLING="true" --- 317 | # WEBHOOK_URL= 318 | # WEBHOOK_SECRET_PATH= 319 | # TELEGRAM_WEBHOOK_SECRET_TOKEN= 320 | ``` 321 | 322 | **Option B: Webhook Mode** 323 | If you prefer webhook mode, you'll need a way for Telegram to reach your bot. 
324 | 325 | * **Using Server IP (HTTP - for testing only, Telegram prefers HTTPS):** 326 | ```env 327 | # .env on server 328 | USE_POLLING="false" # Or omit 329 | WEBHOOK_URL="http://YOUR_SERVER_PUBLIC_IP:8080" 330 | WEBHOOK_SECRET_PATH="your_random_webhook_path_string" # e.g., webhook_bot123 (NO leading slash here) 331 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_token" 332 | # ... other API keys ... 333 | ``` 334 | You'll also need to ensure your server's firewall allows inbound traffic on port `8080`. 335 | ```bash 336 | sudo ufw allow 8080/tcp 337 | ``` 338 | 339 | * **Using a Domain Name (HTTPS - Recommended for Production Webhooks):** 340 | This involves setting up a domain name pointing to your server, using a reverse proxy like Nginx, and obtaining an SSL certificate (e.g., with Let's Encrypt). 341 | ```env 342 | # .env on server 343 | USE_POLLING="false" # Or omit 344 | WEBHOOK_URL="https://yourbot.yourdomain.com" # Nginx will handle HTTPS and proxy to the bot 345 | WEBHOOK_SECRET_PATH="your_random_webhook_path_string" 346 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_token" 347 | # ... other API keys ... 348 | ``` 349 | Your Nginx would be configured to listen on port 443 (HTTPS), terminate SSL, and proxy requests for your `WEBHOOK_SECRET_PATH` to `http://localhost:8080`. Firewall should allow port 443. 350 | 351 | ### 4. Run Deployment Script 352 | 353 | The `deploy_server.sh` script will build the Docker image and start the container. 354 | ```bash 355 | chmod +x ./scripts/deploy_server.sh 356 | ./scripts/deploy_server.sh 357 | ``` 358 | The script uses port `8080` by default. 359 | 360 | ### 5. 
Monitoring 361 | 362 | * **View logs:** `docker logs -f telegram-summarizer` 363 | * **Check status:** `docker ps` 364 | * **Stop:** `docker stop telegram-summarizer` 365 | * **Start:** `docker start telegram-summarizer` 366 | * **Restart:** `docker restart telegram-summarizer` 367 | 368 | If using polling mode, the bot should start processing messages. If using webhook mode, ensure your webhook is correctly set with Telegram (the `deploy_server.sh` script attempts this if it finds webhook variables in `.env`, but `bot.py` also tries on startup). 369 | 370 | ## Deploying to Google Cloud Run 371 | 372 | This guide assumes you have a GCP account, `gcloud` CLI installed and configured, and Docker installed. 373 | 374 | The deployment process involves: 375 | 1. **Setting up Secrets:** Securely store your API keys and tokens in Google Cloud Secret Manager. 376 | 2. **Building & Pushing Image:** Build the Docker image and push it to Google Artifact Registry. 377 | 3. **Deploying Service:** Deploy the image to Cloud Run, mapping the secrets to environment variables. 378 | 4. **Setting Webhook:** Configure the Telegram webhook to point to your Cloud Run service URL. 379 | 380 | We have provided scripts to streamline this process. 381 | 382 | ### 1. Setup Secrets 383 | 384 | This script helps you create secrets in Google Cloud Secret Manager and add your sensitive values (API keys, tokens). 385 | 386 | **IMPORTANT:** Before running, you **must** edit the `SECRETS` array inside `scripts/setup_secrets.sh` to include the *exact names* of the environment variables defined in your `.env` file (e.g., `TELEGRAM_BOT_TOKEN`, `TAVILY_API_KEY`, `TWITTER_API_IO_KEY`, `GEMINI_API_KEY` etc.). 
387 | 388 | ```bash 389 | # Make the script executable (only needed once) 390 | chmod +x ./scripts/setup_secrets.sh 391 | 392 | # Run the secret setup script (it will prompt for project ID and secret values) 393 | ./scripts/setup_secrets.sh 394 | ``` 395 | 396 | Follow the prompts to enter your GCP Project ID (if not already configured) and the values for each secret. 397 | 398 | ### 2. Deploy to Cloud Run 399 | 400 | This script automates building the image, pushing it to Artifact Registry, deploying to Cloud Run, and setting the Telegram webhook. 401 | 402 | **IMPORTANT:** Before running, you **must** edit the `SECRETS_TO_MAP` array inside `scripts/deploy_cloud_run.sh`. This array defines how the secrets you created map to environment variables in your Cloud Run service. Ensure the secret names match those used in `setup_secrets.sh` (e.g., `TWITTER_API_IO_KEY=twitter-api-io-key-secret-name:latest`). 403 | 404 | ```bash 405 | # Make the script executable (only needed once) 406 | chmod +x ./scripts/deploy_cloud_run.sh 407 | 408 | # Run the deployment script (it will prompt for configuration) 409 | ./scripts/deploy_cloud_run.sh 410 | ``` 411 | 412 | The script will prompt you for your GCP Project ID, Region, Service Name, and Artifact Registry Repository Name if they are not set as environment variables. It will then guide you through the build, push, and deployment process, including setting the Telegram webhook automatically if it can find your `TELEGRAM_BOT_TOKEN` secret mapping. 413 | 414 | ### Manual Steps (If needed) 415 | 416 |
Click to view manual gcloud commands

1. **Set Environment Variables (Shell):**
    ```bash
    export PROJECT_ID="your-gcp-project-id"
    export REGION="your-preferred-region" # e.g., us-central1
    export SERVICE_NAME="telegram-summarizer"
    export REPO_NAME="my-summarizer-bot-repo" # Or your preferred Artifact Registry repo name
    export IMAGE_NAME="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO_NAME}/${SERVICE_NAME}:latest"

    gcloud config set project $PROJECT_ID
    gcloud config set run/region $REGION
    ```
2. **Enable Required APIs:**
    ```bash
    gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com secretmanager.googleapis.com
    ```
3. **Create Artifact Registry Repository (if needed):**
    ```bash
    gcloud artifacts repositories create $REPO_NAME \
        --repository-format=docker \
        --location=$REGION \
        --description="Docker repository for bots"
    ```
4. **Configure Docker Authentication:**
    ```bash
    gcloud auth configure-docker ${REGION}-docker.pkg.dev
    ```
5. **Manage Secrets with Secret Manager (Recommended):**
    Store API keys and tokens securely using Google Cloud Secret Manager. Use the `gcloud` CLI (as done by `setup_secrets.sh`):

    * **Create Secret:** (Example: `twitter-api-io-key` for the twitterapi.io key)
      ```bash
      gcloud secrets create twitter-api-io-key --replication-policy="automatic"
      # Add others like tavily-api-key, telegram-bot-token, gemini-api-key, etc.
      ```

    * **Add Secret Version:**
      ```bash
      printf "YOUR_ACTUAL_TWITTERAPI_IO_KEY" | gcloud secrets versions add twitter-api-io-key --data-file=-
      # Add versions for other secrets...
      ```

6. **Build and Push Docker Image:**
    ```bash
    # Build
    docker build -t $IMAGE_NAME .
    # Push
    docker push $IMAGE_NAME
    ```

7.
**Deploy to Cloud Run:** 468 | Replace `SECRET_NAME=SECRET_ID:latest,...` with your actual secret mappings, including `TWITTER_API_IO_KEY`. 469 | ```bash 470 | gcloud run deploy $SERVICE_NAME \ 471 | --image $IMAGE_NAME \ 472 | --platform managed \ 473 | --region $REGION \ 474 | --port 8080 \ 475 | --allow-unauthenticated \ 476 | --set-secrets=TELEGRAM_BOT_TOKEN=telegram-bot-token:latest,TAVILY_API_KEY=tavily-api-key:latest,GEMINI_API_KEY=gemini-api-key:latest,TWITTER_API_IO_KEY=twitter-api-io-key:latest,TELEGRAM_WEBHOOK_SECRET_TOKEN=webhook-secret-token:latest 477 | # Adjust secret names (e.g., twitter-api-io-key, webhook-secret-token) and versions as needed 478 | ``` 479 | 480 | 8. **Get Service URL & Set Telegram Webhook:** 481 | ```bash 482 | # Get the URL 483 | SERVICE_URL=$(gcloud run services describe $SERVICE_NAME --platform managed --region $REGION --format 'value(status.url)') 484 | echo "Service URL: $SERVICE_URL" 485 | 486 | # Get your bot token (replace secret-id if different) 487 | TELEGRAM_BOT_TOKEN=$(gcloud secrets versions access latest --secret=telegram-bot-token) 488 | # Get your webhook secret (optional, replace secret-id if different) 489 | WEBHOOK_SECRET=$(gcloud secrets versions access latest --secret=telegram-webhook-secret-token) 490 | 491 | # Get your webhook path (replace secret-id if different) 492 | WEBHOOK_SECRET_PATH_VAL=$(gcloud secrets versions access latest --secret=webhook-secret-path) # Assuming you stored it 493 | # Get your webhook secret token (replace secret-id if different) 494 | WEBHOOK_SECRET_TOKEN_VAL=$(gcloud secrets versions access latest --secret=webhook-secret-token) 495 | 496 | curl -F "url=${SERVICE_URL}${WEBHOOK_SECRET_PATH_VAL}" \ 497 | -F "secret_token=${WEBHOOK_SECRET_TOKEN_VAL}" \ 498 | https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook 499 | ``` 500 | 501 |
502 | 503 | ## License 504 | 505 | This project is licensed under the Apache License 2.0. See the [LICENSE](LICENSE) file for details. 506 | 507 | --- 508 | 509 | **Stay updated:** [Join the Telegram Channel @tech_links](https://t.me/tech_links) for the latest news and interesting links. If you find this project useful, please consider starring the repository. 510 | 511 | ## Star History 512 | 513 | [![Star History Chart](https://api.star-history.com/svg?repos=kargarisaac/telegram_link_summarizer_agent&type=Date)](https://www.star-history.com/#kargarisaac/telegram_link_summarizer_agent&Date) -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import logging # Added for youtube scraper logging visibility 4 | from typing import Any, Dict, TypedDict, Union, Optional 5 | 6 | from baml_client import b 7 | from baml_client.types import ContentType, Summary, ExtractorTool 8 | from dotenv import load_dotenv 9 | 10 | from langgraph.graph import StateGraph, END 11 | from rich.console import Console 12 | from tools.search import run_tavily_tool 13 | from tools.pdf_handler import get_pdf_text 14 | from tools.twitter_api_tool import fetch_tweet_thread 15 | from tools.linkedin_agentql_scraper import ( 16 | scrape_linkedin_post as scrape_linkedin_post_agentql, 17 | ) 18 | from tools.youtube_agentql_scraper import scrape_youtube as scrape_youtube_agentql 19 | 20 | load_dotenv() 21 | 22 | console = Console() 23 | 24 | # Configure logging slightly for better visibility from tools 25 | logging.basicConfig( 26 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s" 27 | ) 28 | logging.getLogger("httpx").setLevel(logging.WARNING) # Reduce noise from http libraries 29 | logging.getLogger("googleapiclient.discovery_cache").setLevel(logging.ERROR) 30 | 31 | # --- LangGraph Agent State --- 32 | 33 | 34 | class 
AgentState(TypedDict): 35 | original_message: str 36 | url: str 37 | content_type: ContentType # 'web' or 'pdf' or others? 38 | content: str 39 | summary: str 40 | error: Optional[str] 41 | route_decision: Optional[str] # To store the routing decision string 42 | needs_web_fallback: bool # Flag for YouTube fallback 43 | 44 | 45 | # --- Define Graph Nodes --- 46 | 47 | 48 | def init_state(state: AgentState) -> Dict[str, Any]: 49 | """Extracts the URL from the original message.""" 50 | console.print("---INIT STATE---", style="yellow bold") 51 | message = state["original_message"] 52 | # Basic URL extraction (consider a more robust regex) 53 | url = next((word for word in message.split() if word.startswith("http")), None) 54 | error = None if url else "No URL found in the message." 55 | 56 | if error: 57 | console.print(f"Initialization error: {error}", style="red") 58 | 59 | return { 60 | "original_message": message, 61 | "url": url if url else "", # Ensure url is always a string 62 | "content_type": ContentType.Webpage, # Default, gets updated later 63 | "content": "", 64 | "summary": "", 65 | "error": error, 66 | "route_decision": None, 67 | "needs_web_fallback": False, # Initialize flag 68 | } 69 | 70 | 71 | async def llm_router(state: AgentState) -> Dict[str, Any]: 72 | """Determines the content extraction route using the BAML LLM Router.""" 73 | console.print("---LLM ROUTER (BAML)--- ", style="yellow bold") 74 | 75 | # If init failed, pass the error along 76 | if state.get("error"): 77 | console.print( 78 | f"Skipping LLM Router due to init error: {state['error']}", style="red" 79 | ) 80 | return {"error": state["error"], "route_decision": "__error__"} 81 | 82 | message = state["original_message"] 83 | decision = "__error__" # Default to error 84 | routing_error = None 85 | 86 | try: 87 | console.print( 88 | f"Calling BAML RouteRequest for: '{message[:50]}...'", style="cyan" 89 | ) 90 | # Call the BAML function (synchronously, as it's not declared async in BAML) 
91 | route_result: ExtractorTool = b.RouteRequest(original_message=message) 92 | 93 | console.print(f"LLM Router returned: {route_result}", style="green") 94 | 95 | # Map the enum result to string for routing 96 | if route_result == ExtractorTool.WebpageExtractor: 97 | decision = "web_extractor" 98 | elif route_result == ExtractorTool.PDFExtractor: 99 | decision = "pdf_extractor" 100 | elif route_result == ExtractorTool.TwitterExtractor: 101 | decision = "twitter_extractor" 102 | elif route_result == ExtractorTool.LinkedInExtractor: 103 | decision = "linkedin_extractor" 104 | elif route_result == ExtractorTool.YoutubeExtractor: 105 | decision = "youtube_extractor" # Added Youtube route 106 | elif route_result == ExtractorTool.Unsupported: 107 | decision = "__unsupported__" 108 | routing_error = "Unsupported URL type or no URL found by LLM Router." 109 | console.print(routing_error, style="yellow") 110 | else: 111 | # Should not happen if enum is handled correctly 112 | decision = "__error__" 113 | routing_error = f"LLM Router returned an unexpected value: {route_result}" 114 | console.print(routing_error, style="red") 115 | 116 | except Exception as e: 117 | console.print(f"Error calling BAML RouteRequest: {e}", style="red bold") 118 | routing_error = f"LLM Router failed: {e}" 119 | decision = "__error__" 120 | 121 | # Update the state dictionary 122 | return { 123 | "route_decision": decision, 124 | "error": routing_error, # Overwrite previous error state if routing fails 125 | } 126 | 127 | 128 | def get_web_content(state: AgentState) -> Dict[str, Any]: 129 | """Fetches content from a standard webpage URL using Tavily extract.""" 130 | console.print("---GET WEB CONTENT (Tavily Extract)--- ", style="yellow bold") 131 | url = state["url"] 132 | error_message = None 133 | content_source = "" 134 | content_type = ContentType.Webpage 135 | 136 | # Reset error from previous steps if any 137 | state["error"] = None 138 | # Reset fallback flag if we reached here directly 
or as fallback 139 | state["needs_web_fallback"] = False 140 | 141 | try: 142 | # Use Tavily extract for non-Twitter URLs 143 | console.print(f"Using Tavily extract for: {url}", style="cyan") 144 | extract_tool_results = run_tavily_tool(mode="extract", urls=[url]) 145 | results_list = extract_tool_results.get("results", []) 146 | failed_results = extract_tool_results.get("failed_results", []) 147 | 148 | if results_list: 149 | for res in results_list: 150 | # Try to get 'raw_content' first, fallback to 'content' 151 | raw_content = res.get("raw_content") 152 | if not raw_content: 153 | raw_content = res.get( 154 | "content", "" 155 | ) # Fallback if raw_content is missing 156 | 157 | if raw_content: # Only add if content exists 158 | content_source += f"URL: {res.get('url', 'N/A')}\n" 159 | content_source += f"Raw Content: {raw_content}\n\n" 160 | # Optional: Include images if needed later 161 | # content_source += f"Images: {res.get('images', [])}\n" 162 | 163 | if failed_results: 164 | error_message = ( 165 | f"Tavily failed to extract content from: {', '.join(failed_results)}" 166 | ) 167 | console.print(error_message, style="red") 168 | # If extraction failed entirely and we have no content, set content_source empty 169 | if not content_source: 170 | content_source = "" 171 | 172 | # If after trying extract, we still have no content and no specific error, set a generic one 173 | if not content_source and not error_message: 174 | error_message = "Tavily extract did not return any content for the URL." 175 | console.print(error_message, style="red") 176 | 177 | except Exception as e: 178 | console.print(f"Error getting content from URL {url}: {e}", style="red bold") 179 | error_message = f"Error: An unexpected error occurred while getting content from the URL. 
{e}" 180 | content_source = "" # Ensure content is empty on error 181 | 182 | return { 183 | # **state, # Don't spread the entire state, just update relevant fields 184 | "content_type": content_type, 185 | "content": content_source.strip(), # Strip leading/trailing whitespace 186 | "error": error_message, 187 | "needs_web_fallback": False, # Explicitly set to false after web extraction 188 | } 189 | 190 | 191 | def get_twitter_content(state: AgentState) -> Dict[str, Any]: 192 | """Fetches content from a Twitter/X URL using twitter_api_tool.""" 193 | console.print("---GET TWITTER/X CONTENT (twitterapi.io)--- ", style="yellow bold") 194 | url = state["url"] 195 | error_message = None 196 | content_result = "" 197 | content_type = ContentType.Webpage 198 | 199 | # Reset error from previous steps if any 200 | state["error"] = None 201 | state["needs_web_fallback"] = False # Reset flag 202 | 203 | try: 204 | console.print(f"Fetching tweet thread for URL: {url}", style="cyan") 205 | # Use the new tool 206 | content_result = fetch_tweet_thread(url) 207 | 208 | # Check if the tool returned an error message 209 | if isinstance(content_result, str) and content_result.startswith("Error:"): 210 | error_message = content_result 211 | console.print(error_message, style="red bold") 212 | content_result = "" # Ensure content is empty if tool errored 213 | elif not content_result: # Handle empty success case 214 | error_message = "Twitter tool returned no content." 
215 | console.print(error_message, style="yellow") 216 | content_result = "" 217 | else: 218 | console.print( 219 | f"Successfully fetched Twitter content for: {url}", style="green" 220 | ) 221 | # Ensure content_result is a string 222 | if not isinstance(content_result, str): 223 | content_result = str(content_result) 224 | 225 | except Exception as e: 226 | console.print( 227 | f"Unexpected error calling fetch_tweet_thread for {url}: {e}", 228 | style="red bold", 229 | ) 230 | error_message = ( 231 | f"Error: An unexpected error occurred while calling the Twitter tool. {e}" 232 | ) 233 | content_result = "" 234 | 235 | return { 236 | # **state, 237 | "content_type": content_type, 238 | "content": content_result.strip(), 239 | "error": error_message, 240 | "needs_web_fallback": False, 241 | } 242 | 243 | 244 | def get_linkedin_content(state: AgentState) -> Dict[str, Any]: 245 | """Fetches content from a LinkedIn post URL using linkedin_scraper_tool.""" 246 | console.print( 247 | "---GET LINKEDIN CONTENT (linkedin_scraper_tool)--- ", style="yellow bold" 248 | ) 249 | url = state["url"] 250 | error_message = None 251 | content_result = "" 252 | content_type = ( 253 | ContentType.Webpage 254 | ) # LinkedIn posts are treated as webpages for summarization 255 | 256 | # Reset error from previous steps if any 257 | state["error"] = None 258 | state["needs_web_fallback"] = False # Reset flag 259 | 260 | try: 261 | console.print(f"Fetching LinkedIn post content for URL: {url}", style="cyan") 262 | # Use the LinkedIn tool (AgentQL version) 263 | result = scrape_linkedin_post_agentql( 264 | url, headless=True 265 | ) # Call with headless=True 266 | 267 | # AgentQL scraper returns a dict: {"author": "...", "content": "..."} 268 | if isinstance(result, dict) and result.get("content"): 269 | content_result = result["content"] 270 | # author = result.get("author") # Author is available if needed later 271 | console.print( 272 | f"Successfully fetched LinkedIn content (AgentQL) 
for: {url}", 273 | style="green", 274 | ) 275 | if not isinstance(content_result, str): 276 | content_result = str(content_result) 277 | elif ( 278 | isinstance(result, dict) and "error" in result 279 | ): # Check for an error key if scraper returns errors that way 280 | error_message = ( 281 | f"LinkedIn AgentQL scraper returned an error: {result['error']}" 282 | ) 283 | console.print(error_message, style="red bold") 284 | content_result = "" 285 | else: 286 | error_message = f"LinkedIn AgentQL scraper returned unexpected result or no content: {result}" 287 | console.print(error_message, style="yellow") 288 | content_result = "" 289 | 290 | except Exception as e: 291 | console.print( 292 | f"Unexpected error calling scrape_linkedin_post for {url}: {e}", 293 | style="red bold", 294 | ) 295 | error_message = ( 296 | f"Error: An unexpected error occurred while calling the LinkedIn tool. {e}" 297 | ) 298 | content_result = "" 299 | 300 | return { 301 | # **state, 302 | "content_type": content_type, 303 | "content": content_result.strip(), 304 | "error": error_message, 305 | "needs_web_fallback": False, 306 | } 307 | 308 | 309 | def get_youtube_content(state: AgentState) -> Dict[str, Any]: 310 | """Fetches content (description/transcript) using youtube_scraper with fallbacks.""" 311 | console.print( 312 | "---GET YOUTUBE CONTENT (yt-dlp + Fallbacks)--- ", style="yellow bold" 313 | ) 314 | url = state["url"] 315 | error_message = None 316 | content_result = "" 317 | # For YouTube, let's treat the content type as Webpage for the summarizer initially 318 | content_type = ContentType.Webpage 319 | 320 | # Reset error and fallback flag from previous steps if any 321 | state["error"] = None 322 | # No longer using needs_web_fallback with AgentQL direct approach 323 | # state["needs_web_fallback"] = False 324 | 325 | try: 326 | console.print(f"Fetching YouTube info for URL (AgentQL): {url}", style="cyan") 327 | # Use the YouTube AgentQL tool 328 | result = 
scrape_youtube_agentql(url, headless=True) # Call with headless=True 329 | 330 | # AgentQL scraper returns: {"title": "...", "description": "..."} 331 | if isinstance(result, dict) and ( 332 | result.get("title") or result.get("description") 333 | ): 334 | title = result.get("title", "") 335 | description = result.get("description", "") 336 | content_result = f"Title: {title}\n\nDescription:\n{description}".strip() 337 | console.print( 338 | f"Successfully fetched YouTube content (AgentQL) for: {url}", 339 | style="green", 340 | ) 341 | error_message = None 342 | elif ( 343 | isinstance(result, dict) and "error" in result 344 | ): # If scraper returns dict with error 345 | error_message = ( 346 | f"YouTube AgentQL scraper returned an error: {result['error']}" 347 | ) 348 | console.print(error_message, style="red bold") 349 | content_result = "" 350 | else: 351 | error_message = f"YouTube AgentQL scraper returned unexpected result or no content: {result}" 352 | console.print(error_message, style="yellow") 353 | content_result = "" 354 | 355 | except Exception as e: 356 | console.print( 357 | f"Unexpected error calling scrape_youtube_agentql for {url}: {e}", 358 | style="red bold", 359 | ) 360 | error_message = f"Error: An unexpected error occurred while calling the YouTube AgentQL tool. 
{e}" 361 | content_result = "" 362 | # needs_fallback = False # Not used anymore 363 | 364 | return { 365 | # **state, 366 | "content_type": content_type, 367 | "content": content_result.strip(), 368 | "error": error_message, # Will be None if successful 369 | "needs_web_fallback": False, # Explicitly set to false, not used for fallback anymore 370 | } 371 | 372 | 373 | def handle_pdf_content(state: AgentState) -> Dict[str, Any]: 374 | """Downloads and extracts text from a PDF URL.""" 375 | console.print("---HANDLE PDF CONTENT--- ", style="bold yellow") 376 | url = state["url"] 377 | error_message = None 378 | pdf_text = "" 379 | 380 | # Reset error from previous steps if any 381 | state["error"] = None 382 | state["needs_web_fallback"] = False # Reset flag 383 | 384 | try: 385 | extracted_text = get_pdf_text(url) 386 | if isinstance(extracted_text, str) and extracted_text.startswith("Error:"): 387 | console.print( 388 | f"Error getting PDF content: {extracted_text}", style="red bold" 389 | ) 390 | error_message = extracted_text 391 | elif not extracted_text: 392 | error_message = "PDF extraction returned no text." 393 | console.print(error_message, style="yellow") 394 | else: 395 | console.print( 396 | f"Successfully extracted text from PDF: {url}", style="magenta" 397 | ) 398 | pdf_text = extracted_text 399 | # Ensure text is string 400 | if not isinstance(pdf_text, str): 401 | pdf_text = str(pdf_text) 402 | 403 | except Exception as e: 404 | console.print(f"Unexpected error handling PDF {url}: {e}", style="red bold") 405 | error_message = ( 406 | f"Error: An unexpected error occurred while processing the PDF. 
{e}" 407 | ) 408 | 409 | return { 410 | # **state, 411 | "content": pdf_text.strip(), 412 | "content_type": ContentType.PDF, 413 | "error": error_message, 414 | "needs_web_fallback": False, 415 | } 416 | 417 | 418 | async def summarize_content(state: AgentState) -> Dict[str, Any]: 419 | """Summarizes the extracted content using BAML.""" 420 | console.print("---SUMMARIZE CONTENT--- ", style="bold green") 421 | 422 | content_to_summarize = state.get("content") 423 | 424 | # If there was an error *before* summarization, don't proceed 425 | if state.get("error"): 426 | console.print( 427 | f"Skipping summarization due to previous error: {state['error']}", 428 | style="yellow", 429 | ) 430 | return {"summary": "", "error": state["error"]} # Keep existing error 431 | 432 | if not content_to_summarize or content_to_summarize.strip() == "": 433 | console.print("No content available to summarize.", style="yellow") 434 | # If we reached here due to an upstream error, preserve it 435 | # Otherwise, set an error indicating no content. 436 | final_error = state.get("error") or "No content found to summarize." 
437 | return { 438 | "summary": "", 439 | "error": final_error, 440 | } 441 | 442 | url = state.get("url", "Unknown URL") 443 | summarization_error = None 444 | formatted_summary = "" 445 | 446 | try: 447 | console.print( 448 | f"--- Debug: Summarizing {len(content_to_summarize)} chars --- ", 449 | style="dim", 450 | ) 451 | # Ensure content_type is valid, default to Webpage if missing/invalid 452 | content_type = state.get("content_type", ContentType.Webpage) 453 | if not isinstance(content_type, ContentType): 454 | content_type = ContentType.Webpage # Default fallback 455 | 456 | # Call the BAML function (assuming it's synchronous based on definition) 457 | summary_result: Summary = b.SummarizeContent( 458 | content=content_to_summarize, 459 | content_type=content_type, 460 | context=state.get("original_message", ""), 461 | ) 462 | console.print(f"Successfully generated summary.", style="bold green") 463 | title = getattr(summary_result, "title", "Summary") # Default title 464 | key_points = getattr(summary_result, "key_points", []) 465 | concise_summary = getattr( 466 | summary_result, 467 | "concise_summary", 468 | "Summarization service returned an unexpected response format.", 469 | ) 470 | 471 | # Ensure parts are strings 472 | title = str(title) if title else "Summary" 473 | key_points = [str(p).strip() for p in key_points if p] 474 | concise_summary = ( 475 | str(concise_summary).strip() if concise_summary else "No summary generated." 
476 | ) 477 | 478 | formatted_summary = f"# {title}\n\n" 479 | if key_points: 480 | formatted_summary += "## Key Points:\n" 481 | for point in key_points: 482 | formatted_summary += f"- {point}\n" 483 | formatted_summary += "\n" # Add space before summary 484 | formatted_summary += f"## Summary:\n{concise_summary}" 485 | formatted_summary = re.sub(r"\n\s*\n", "\n\n", formatted_summary).strip() 486 | 487 | # Clear any previous error if summarization succeeds 488 | summarization_error = None 489 | 490 | except Exception as e: 491 | console.print(f"Error during summarization for {url}: {e}", style="red bold") 492 | print(f"--- Debug: BAML summarization error: {e} ---") 493 | summarization_error = f"Summarization failed: {e}" 494 | formatted_summary = "" # Ensure summary is empty on error 495 | 496 | # Return only summary and error, let graph manage state merge 497 | return { 498 | "summary": formatted_summary, 499 | "error": summarization_error, # Overwrite previous errors only if summarization fails 500 | } 501 | 502 | 503 | # --- Conditional Edges Logic --- 504 | 505 | 506 | def route_based_on_llm(state: AgentState) -> str: 507 | """Routes to the appropriate extractor based on the LLM router decision.""" 508 | console.print("---ROUTING (LLM Decision)--- ", style="yellow bold") 509 | decision = state.get("route_decision") 510 | error = state.get("error") # Check for errors from init or router node 511 | 512 | if error: 513 | console.print(f"Routing to END due to error: {error}", style="red") 514 | return END 515 | 516 | if decision == "web_extractor": 517 | console.print(f"LLM Routed to: Web Extractor", style="magenta") 518 | return "web_extractor" 519 | elif decision == "pdf_extractor": 520 | console.print(f"LLM Routed to: PDF Extractor", style="magenta") 521 | return "pdf_extractor" 522 | elif decision == "twitter_extractor": 523 | console.print(f"LLM Routed to: Twitter Extractor", style="magenta") 524 | return "twitter_extractor" 525 | elif decision == 
"linkedin_extractor": 526 | console.print(f"LLM Routed to: LinkedIn Extractor", style="magenta") 527 | return "linkedin_extractor" 528 | elif decision == "youtube_extractor": 529 | console.print(f"LLM Routed to: YouTube Extractor", style="magenta") 530 | return "youtube_extractor" # Added Youtube route 531 | elif decision == "__unsupported__": 532 | console.print("LLM Routed to: Unsupported -> END", style="yellow") 533 | # Error message should already be set by the router node 534 | return END 535 | else: # Includes __error__ or unexpected values 536 | console.print( 537 | f"LLM Routing decision invalid or error ('{decision}'). Routing to END.", 538 | style="red", 539 | ) 540 | # Ensure error state reflects this if not already set 541 | current_error = state.get("error") 542 | if not current_error: 543 | # Update state directly is tricky in conditional functions. 544 | # Ideally, the router node should set the error if decision is __error__. 545 | # For now, just log and route to end. 546 | console.print( 547 | f"Setting error state due to invalid routing: {decision}", style="red" 548 | ) 549 | # state["error"] = f"Invalid routing decision: {decision}" 550 | return END 551 | 552 | 553 | def should_summarize(state: AgentState) -> str: 554 | """Determines whether to proceed to summarization or end after extraction.""" 555 | content = state.get("content") 556 | error = state.get("error") # Check error from the *extractor* node 557 | has_content = content and isinstance(content, str) and content.strip() != "" 558 | # needs_fallback is no longer used for YouTube -> Webpage fallback 559 | # if needs_fallback: 560 | # console.print( 561 | # "Routing after Extraction: YouTube fallback failed, routing to Web Extractor.", 562 | # style="yellow", 563 | # ) 564 | # return "web_extractor" # Route to web extractor as the last resort 565 | 566 | if error: 567 | console.print( 568 | f"Routing after Extraction: Error occurred ('{error}'), routing to END.", 569 | style="red", 570 | 
) 571 | return END 572 | elif has_content: 573 | console.print( 574 | "Routing after Extraction: Content extracted successfully, routing to Summarize.", 575 | style="green", 576 | ) 577 | return "summarize_content" 578 | else: 579 | console.print( 580 | "Routing after Extraction: No content extracted and no specific error, routing to END.", 581 | style="yellow", 582 | ) 583 | # Set an error if none exists from the extractor 584 | current_error = state.get("error") 585 | final_error = current_error or "Content extraction finished with no content." 586 | # state["error"] = final_error # Avoid direct state modification here 587 | console.print(f"Setting error state: {final_error}", style="yellow") 588 | # How to set error state correctly before END? 589 | # LangGraph merges the partial state returned by the node *after* the edge logic. 590 | # We might need an explicit error handling node. 591 | # For now, just route to END. The final state check should catch the lack of summary. 592 | return END 593 | 594 | 595 | # --- Build the Graph --- 596 | 597 | 598 | def build_graph(): 599 | workflow = StateGraph(AgentState) 600 | 601 | # Add nodes 602 | workflow.add_node("init", init_state) 603 | workflow.add_node("llm_router", llm_router) # New router node 604 | workflow.add_node("web_extractor", get_web_content) 605 | workflow.add_node("pdf_extractor", handle_pdf_content) 606 | workflow.add_node("twitter_extractor", get_twitter_content) 607 | workflow.add_node("linkedin_extractor", get_linkedin_content) 608 | workflow.add_node("youtube_extractor", get_youtube_content) # Add new node 609 | workflow.add_node("summarize_content", summarize_content) 610 | 611 | # Define edges 612 | workflow.set_entry_point("init") 613 | 614 | # Edge from init to the LLM router 615 | workflow.add_edge("init", "llm_router") 616 | 617 | # Conditional routing based on LLM Router output 618 | workflow.add_conditional_edges( 619 | "llm_router", 620 | route_based_on_llm, 621 | { 622 | "web_extractor": 
"web_extractor", 623 | "pdf_extractor": "pdf_extractor", 624 | "twitter_extractor": "twitter_extractor", 625 | "linkedin_extractor": "linkedin_extractor", 626 | "youtube_extractor": "youtube_extractor", # Add edge to new node 627 | END: END, # Handles errors and unsupported cases from the router 628 | }, 629 | ) 630 | 631 | # Route from each extractor to the summarization check 632 | # Note: The should_summarize function now handles routing to web_extractor for YouTube fallback 633 | workflow.add_conditional_edges( 634 | "web_extractor", 635 | should_summarize, 636 | { 637 | "summarize_content": "summarize_content", 638 | END: END, 639 | # No web_extractor fallback from web_extractor itself 640 | }, 641 | ) 642 | workflow.add_conditional_edges( 643 | "pdf_extractor", 644 | should_summarize, 645 | { 646 | "summarize_content": "summarize_content", 647 | END: END, 648 | # No web_extractor fallback needed from pdf 649 | }, 650 | ) 651 | workflow.add_conditional_edges( 652 | "twitter_extractor", 653 | should_summarize, 654 | { 655 | "summarize_content": "summarize_content", 656 | END: END, 657 | # No web_extractor fallback needed from twitter 658 | }, 659 | ) 660 | workflow.add_conditional_edges( 661 | "linkedin_extractor", 662 | should_summarize, 663 | { 664 | "summarize_content": "summarize_content", 665 | END: END, 666 | # No web_extractor fallback needed from linkedin 667 | }, 668 | ) 669 | workflow.add_conditional_edges( 670 | "youtube_extractor", # Edges from YouTube extractor 671 | should_summarize, # Use the same logic function, now enhanced 672 | { 673 | "summarize_content": "summarize_content", 674 | END: END, 675 | }, 676 | ) 677 | 678 | # Summarizer always goes to end 679 | workflow.add_edge("summarize_content", END) 680 | 681 | return workflow.compile() 682 | 683 | 684 | graph = build_graph() 685 | 686 | # --- Main Agent Function --- 687 | 688 | 689 | async def run_agent(message: str) -> Union[str, None]: 690 | """ 691 | Runs the LangGraph agent workflow 
for URL summarization using an LLM router. 692 | 693 | Args: 694 | message: The original message potentially containing a URL. 695 | 696 | Returns: 697 | - str: Summary text on successful extraction and summarization. 698 | - str: An error message string if a significant error occurred. 699 | - None: Should ideally not be returned if error handling is robust. 700 | """ 701 | inputs = {"original_message": message} 702 | final_state = None 703 | try: 704 | # Use graph.astream for async execution 705 | async for output in graph.astream( 706 | inputs, {"recursion_limit": 15} 707 | ): # Increased recursion limit 708 | # output is a dictionary where keys are node names and values are states after the node ran 709 | # We are interested in the state *after* the last node executes 710 | node_name = list(output.keys())[0] 711 | final_state = output[node_name] # Keep track of the latest state 712 | console.print(f"Output from node '{node_name}': Updated state", style="dim") 713 | # Optional: Print intermediate state details if needed for debugging 714 | # console.print(f" State keys: {list(final_state.keys())}", style="dim") 715 | 716 | if final_state: 717 | # Debug: Print the final state (simplified) 718 | console.print("---FINAL STATE--- ", style="bold magenta") 719 | # Sort keys for consistent output order 720 | state_keys = sorted(final_state.keys()) 721 | for key in state_keys: 722 | value = final_state[key] 723 | if key == "content" and isinstance(value, str) and len(value) > 200: 724 | console.print( 725 | f" {key}: ({len(value)} chars)", style="magenta" 726 | ) 727 | elif isinstance(value, str) and len(value) > 100: 728 | console.print( 729 | f" {key}: ({len(value)} chars)", style="magenta" 730 | ) 731 | else: 732 | console.print(f" {key}: {value}", style="magenta") 733 | 734 | # Determine final result based on summary and error fields 735 | summary_text = final_state.get("summary") 736 | final_error = final_state.get("error") 737 | 738 | # 1. 
Successful Summary (even if there were intermediate, recoverable errors) 739 | if summary_text and isinstance(summary_text, str) and summary_text.strip(): 740 | console.print("---AGENT FINISHED: Summary--- ", style="bold green") 741 | # If an error occurred *before* summarization, but summarization *still* happened 742 | # (e.g. fallback content used), we might want to mention the error. 743 | # For now, prioritize showing the summary if available. 744 | # if final_error: 745 | # console.print(f"(Note: An earlier error occurred: {final_error})", style="yellow") 746 | return summary_text 747 | 748 | # 2. Error occurred (could be init, routing, extraction, or summarization error) 749 | elif final_error: 750 | console.print( 751 | f"---AGENT FINISHED: Error ('{final_error}')--- ", style="bold red" 752 | ) 753 | # Ensure the error message is prefixed consistently 754 | if isinstance(final_error, str) and final_error.lower().startswith( 755 | "error:" 756 | ): 757 | return final_error 758 | else: 759 | return "Error: " + str(final_error) # Ensure it's a string 760 | 761 | # 3. No Summary and No Error (Should ideally not happen with should_summarize logic, 762 | # but could occur if summarizer returns empty without error) 763 | else: 764 | console.print( 765 | "---AGENT FINISHED: No Summary/No Error--- ", style="bold yellow" 766 | ) 767 | # Provide a more specific fallback message 768 | if not final_state.get("content"): 769 | # Check if it was an unsupported URL type initially 770 | if final_state.get("route_decision") == "__unsupported__": 771 | return "Error: The provided link type is not supported or no URL was found." 772 | else: 773 | return "Error: Agent finished without extracting content." 774 | else: 775 | return "Error: Agent finished. Content was extracted, but no summary was generated and no specific error was reported." 
776 | 777 | else: 778 | console.print("---AGENT FAILED: No Final State--- ", style="bold red") 779 | return "Error: Agent workflow did not produce a final state." 780 | 781 | except Exception as e: 782 | console.print("---AGENT FAILED: Runtime Exception--- ", style="bold red") 783 | console.print_exception(show_locals=False) 784 | # Ensure the exception is converted to a string for the return value 785 | return "Error: An unexpected error occurred in the agent: " + str(e) 786 | 787 | 788 | # Example usage (for testing) 789 | if __name__ == "__main__": 790 | import asyncio 791 | 792 | # --- Test Cases --- 793 | # Twitter/X URL 794 | test_url_msg_twitter = ( 795 | "Summarize this tweet: https://x.com/natolambert/status/1917928418068541520" 796 | ) 797 | # Standard Web URL 798 | test_url_msg_web = ( 799 | "Can you summarize this? https://lilianweng.github.io/posts/2023-06-23-agent/" 800 | ) 801 | # PDF URL 802 | test_url_msg_pdf = "Summarize: https://arxiv.org/pdf/2305.15334.pdf" 803 | # URL that might fail primary extraction (Tavily might fail, but router should still pick web) 804 | test_url_msg_fail = ( 805 | "What about this? https://httpbin.org/delay/5" # Example, Tavily might timeout 806 | ) 807 | # LinkedIn URL 808 | test_url_msg_linkedin = "Summarize this post: https://www.linkedin.com/posts/omarsar_llms-for-engineering-activity-7324064951734603776-Ravc?utm_source=share&utm_medium=member_desktop&rcm=ACoAABDFOm0BmXlu4cLYtJePo0mLzdFoB5itUNU" 809 | # Message without a URL (Router should pick Unsupported) 810 | test_url_msg_nourl = "Hello, how are you?" 
811 | # Unsupported URL Type (Router should pick Unsupported) 812 | test_url_msg_unsupported = "Check this out: ftp://files.example.com/data.zip" 813 | # YouTube URL (Router should pick Youtube) 814 | test_url_msg_youtube = "Summarize this video: https://www.youtube.com/watch?v=n5oBmmBkW6A" # URL from youtube_scraper test 815 | # YouTube URL that requires login (Should fallback to Tavily) 816 | test_url_msg_youtube_login = ( 817 | "Summarize: https://www.youtube.com/watch?v=hhMXE9-JUAc" # Test fallback 818 | ) 819 | 820 | async def main(): 821 | test_cases = { 822 | # "Twitter": test_url_msg_twitter, 823 | # "Web": test_url_msg_web, 824 | # "PDF": test_url_msg_pdf, 825 | # "Web Fail": test_url_msg_fail, # May take time 826 | # "LinkedIn": test_url_msg_linkedin, 827 | # "No URL": test_url_msg_nourl, 828 | # "Unsupported FTP": test_url_msg_unsupported, 829 | "YouTube": test_url_msg_youtube, 830 | # "YouTube Needs Login": test_url_msg_youtube_login, # Test fallback (AgentQL should handle public ones) 831 | } 832 | 833 | for name, msg in test_cases.items(): 834 | print(f"\n{'=/' * 10} RUNNING TEST: {name} {'=/' * 10}") 835 | print(f"Input message: {msg}") 836 | result = await run_agent(msg) 837 | print("\n--- FINAL RESULT --- ") 838 | if result: 839 | # Ensure result is treated as a string before printing 840 | print(str(result)) 841 | else: 842 | # Handle the case where run_agent might return None (though it aims not to) 843 | print("Agent returned None or an empty result.") 844 | print(f"{'=/' * 10} FINISHED TEST: {name} {'=/' * 10}\n") 845 | 846 | asyncio.run(main()) 847 | -------------------------------------------------------------------------------- /agent_viz.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import marimo 4 | 5 | __generated_with = "0.13.2" 6 | app = marimo.App(width="medium") 7 | 8 | 9 | @app.cell 10 | def _(): 11 | import marimo as mo 12 | import nest_asyncio 13 | 14 | nest_asyncio.apply() # Allow 
nested asyncio loops 15 | 16 | # --- The rest of your imports --- 17 | from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles 18 | from agent import build_graph # Your import 19 | # --------------------------------- 20 | 21 | app = build_graph() 22 | 23 | # Now this should work without the RuntimeError 24 | png_bytes = app.get_graph().draw_mermaid_png( 25 | draw_method=MermaidDrawMethod.PYPPETEER, 26 | ) 27 | 28 | # Display the image using marimo 29 | mo.image(src=png_bytes, alt="LangGraph Flow", caption="Telegram Summarizer Bot Graph") 30 | return 31 | 32 | 33 | @app.cell 34 | def _(): 35 | return 36 | 37 | 38 | if __name__ == "__main__": 39 | app.run() 40 | -------------------------------------------------------------------------------- /baml_client/__init__.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | __version__ = "0.88.0" 17 | 18 | try: 19 | from baml_py.safe_import import EnsureBamlPyImport 20 | except ImportError: 21 | raise ImportError(f"""Update to baml-py required. 22 | Version of baml_client generator (see generators.baml): {__version__} 23 | 24 | Please upgrade baml-py to version "{__version__}". 
25 | 26 | $ pip install baml-py=={__version__} 27 | $ uv add baml-py=={__version__} 28 | 29 | If nothing else works, please ask for help: 30 | 31 | https://github.com/boundaryml/baml/issues 32 | https://boundaryml.com/discord 33 | """) from None 34 | 35 | with EnsureBamlPyImport(__version__) as e: 36 | e.raise_if_incompatible_version(__version__) 37 | 38 | from . import types 39 | from . import tracing 40 | from . import partial_types 41 | from . import config 42 | from .config import reset_baml_env_vars 43 | 44 | from .sync_client import b 45 | 46 | 47 | __all__ = [ 48 | "b", 49 | "partial_types", 50 | "tracing", 51 | "types", 52 | "reset_baml_env_vars", 53 | "config", 54 | ] -------------------------------------------------------------------------------- /baml_client/async_client.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, TypeVar, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | import pprint 19 | 20 | import baml_py 21 | from pydantic import BaseModel, ValidationError, create_model 22 | 23 | from . 
import partial_types, types 24 | from .types import Checked, Check 25 | from .type_builder import TypeBuilder 26 | from .parser import LlmResponseParser, LlmStreamParser 27 | from .async_request import AsyncHttpRequest, AsyncHttpStreamRequest 28 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 29 | 30 | OutputType = TypeVar('OutputType') 31 | 32 | 33 | # Define the TypedDict with optional parameters having default values 34 | class BamlCallOptions(TypedDict, total=False): 35 | tb: NotRequired[TypeBuilder] 36 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 37 | collector: NotRequired[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] 38 | 39 | 40 | class BamlAsyncClient: 41 | __runtime: baml_py.BamlRuntime 42 | __ctx_manager: baml_py.BamlCtxManager 43 | __stream_client: "BamlStreamClient" 44 | __http_request: "AsyncHttpRequest" 45 | __http_stream_request: "AsyncHttpStreamRequest" 46 | __llm_response_parser: LlmResponseParser 47 | __llm_stream_parser: LlmStreamParser 48 | __baml_options: BamlCallOptions 49 | 50 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 51 | self.__runtime = runtime 52 | self.__ctx_manager = ctx_manager 53 | self.__stream_client = BamlStreamClient(self.__runtime, self.__ctx_manager, baml_options) 54 | self.__http_request = AsyncHttpRequest(self.__runtime, self.__ctx_manager) 55 | self.__http_stream_request = AsyncHttpStreamRequest(self.__runtime, self.__ctx_manager) 56 | self.__llm_response_parser = LlmResponseParser(self.__runtime, self.__ctx_manager) 57 | self.__llm_stream_parser = LlmStreamParser(self.__runtime, self.__ctx_manager) 58 | self.__baml_options = baml_options or {} 59 | 60 | def with_options( 61 | self, 62 | tb: Optional[TypeBuilder] = None, 63 | client_registry: Optional[baml_py.baml_py.ClientRegistry] = None, 64 | 
collector: Optional[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] = None, 65 | ) -> "BamlAsyncClient": 66 | """ 67 | Returns a new instance of BamlAsyncClient with explicitly typed baml options 68 | for Python 3.8 compatibility. 69 | """ 70 | new_options = self.__baml_options.copy() 71 | 72 | # Override if any keyword arguments were provided. 73 | if tb is not None: 74 | new_options["tb"] = tb 75 | if client_registry is not None: 76 | new_options["client_registry"] = client_registry 77 | if collector is not None: 78 | new_options["collector"] = collector 79 | 80 | return BamlAsyncClient(self.__runtime, self.__ctx_manager, new_options) 81 | 82 | @property 83 | def stream(self): 84 | return self.__stream_client 85 | 86 | @property 87 | def request(self): 88 | return self.__http_request 89 | 90 | @property 91 | def stream_request(self): 92 | return self.__http_stream_request 93 | 94 | @property 95 | def parse(self): 96 | return self.__llm_response_parser 97 | 98 | @property 99 | def parse_stream(self): 100 | return self.__llm_stream_parser 101 | 102 | 103 | async def RouteRequest( 104 | self, 105 | original_message: str, 106 | baml_options: BamlCallOptions = {}, 107 | ) -> types.ExtractorTool: 108 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 109 | 110 | __tb__ = options.get("tb", None) 111 | if __tb__ is not None: 112 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 113 | else: 114 | tb = None 115 | __cr__ = options.get("client_registry", None) 116 | collector = options.get("collector", None) 117 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 118 | raw = await self.__runtime.call_function( 119 | "RouteRequest", 120 | { 121 | "original_message": original_message, 122 | }, 123 | self.__ctx_manager.get(), 124 | tb, 125 | __cr__, 126 | collectors, 127 | ) 128 | return cast(types.ExtractorTool, raw.cast_to(types, types, partial_types, 
False)) 129 | 130 | async def SummarizeContent( 131 | self, 132 | content: str,content_type: types.ContentType,context: Optional[str], 133 | baml_options: BamlCallOptions = {}, 134 | ) -> types.Summary: 135 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 136 | 137 | __tb__ = options.get("tb", None) 138 | if __tb__ is not None: 139 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 140 | else: 141 | tb = None 142 | __cr__ = options.get("client_registry", None) 143 | collector = options.get("collector", None) 144 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 145 | raw = await self.__runtime.call_function( 146 | "SummarizeContent", 147 | { 148 | "content": content,"content_type": content_type,"context": context, 149 | }, 150 | self.__ctx_manager.get(), 151 | tb, 152 | __cr__, 153 | collectors, 154 | ) 155 | return cast(types.Summary, raw.cast_to(types, types, partial_types, False)) 156 | 157 | 158 | 159 | class BamlStreamClient: 160 | __runtime: baml_py.BamlRuntime 161 | __ctx_manager: baml_py.BamlCtxManager 162 | __baml_options: BamlCallOptions 163 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 164 | self.__runtime = runtime 165 | self.__ctx_manager = ctx_manager 166 | self.__baml_options = baml_options or {} 167 | 168 | 169 | def RouteRequest( 170 | self, 171 | original_message: str, 172 | baml_options: BamlCallOptions = {}, 173 | ) -> baml_py.BamlStream[Optional[types.ExtractorTool], types.ExtractorTool]: 174 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 175 | __tb__ = options.get("tb", None) 176 | if __tb__ is not None: 177 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 178 | else: 179 | tb = None 180 | __cr__ = options.get("client_registry", None) 181 | collector = options.get("collector", None) 182 | 
collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 183 | raw = self.__runtime.stream_function( 184 | "RouteRequest", 185 | { 186 | "original_message": original_message, 187 | }, 188 | None, 189 | self.__ctx_manager.get(), 190 | tb, 191 | __cr__, 192 | collectors, 193 | ) 194 | 195 | return baml_py.BamlStream[Optional[types.ExtractorTool], types.ExtractorTool]( 196 | raw, 197 | lambda x: cast(Optional[types.ExtractorTool], x.cast_to(types, types, partial_types, True)), 198 | lambda x: cast(types.ExtractorTool, x.cast_to(types, types, partial_types, False)), 199 | self.__ctx_manager.get(), 200 | ) 201 | 202 | def SummarizeContent( 203 | self, 204 | content: str,content_type: types.ContentType,context: Optional[str], 205 | baml_options: BamlCallOptions = {}, 206 | ) -> baml_py.BamlStream[partial_types.Summary, types.Summary]: 207 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 208 | __tb__ = options.get("tb", None) 209 | if __tb__ is not None: 210 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 211 | else: 212 | tb = None 213 | __cr__ = options.get("client_registry", None) 214 | collector = options.get("collector", None) 215 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 216 | raw = self.__runtime.stream_function( 217 | "SummarizeContent", 218 | { 219 | "content": content, 220 | "content_type": content_type, 221 | "context": context, 222 | }, 223 | None, 224 | self.__ctx_manager.get(), 225 | tb, 226 | __cr__, 227 | collectors, 228 | ) 229 | 230 | return baml_py.BamlStream[partial_types.Summary, types.Summary]( 231 | raw, 232 | lambda x: cast(partial_types.Summary, x.cast_to(types, types, partial_types, True)), 233 | lambda x: cast(types.Summary, x.cast_to(types, types, partial_types, False)), 234 | self.__ctx_manager.get(), 235 | ) 236 | 237 | 238 | 239 | b = 
BamlAsyncClient(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX) 240 | 241 | __all__ = ["b"] -------------------------------------------------------------------------------- /baml_client/async_request.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . 
import types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class AsyncHttpRequest: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | async def RouteRequest( 41 | self, 42 | original_message: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> baml_py.HTTPRequest: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | return await self.__runtime.build_request( 53 | "RouteRequest", 54 | { 55 | "original_message": original_message, 56 | }, 57 | self.__ctx_manager.get(), 58 | tb, 59 | __cr__, 60 | False, 61 | ) 62 | 63 | async def SummarizeContent( 64 | self, 65 | content: str,content_type: types.ContentType,context: Optional[str], 66 | baml_options: BamlCallOptions = {}, 67 | ) -> baml_py.HTTPRequest: 68 | __tb__ = baml_options.get("tb", None) 69 | if __tb__ is not None: 70 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 71 | else: 72 | tb = None 73 | __cr__ = baml_options.get("client_registry", None) 74 | 75 | return await self.__runtime.build_request( 76 | "SummarizeContent", 77 | { 78 | "content": content, 79 | "content_type": content_type, 80 | "context": context, 81 | }, 82 | self.__ctx_manager.get(), 83 | tb, 84 | __cr__, 85 | False, 86 | ) 87 | 88 | 89 | 90 | class AsyncHttpStreamRequest: 91 | __runtime: baml_py.BamlRuntime 92 | __ctx_manager: baml_py.BamlCtxManager 93 | 94 | def __init__(self, runtime: baml_py.BamlRuntime, 
ctx_manager: baml_py.BamlCtxManager): 95 | self.__runtime = runtime 96 | self.__ctx_manager = ctx_manager 97 | 98 | 99 | async def RouteRequest( 100 | self, 101 | original_message: str, 102 | baml_options: BamlCallOptions = {}, 103 | ) -> baml_py.HTTPRequest: 104 | __tb__ = baml_options.get("tb", None) 105 | if __tb__ is not None: 106 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 107 | else: 108 | tb = None 109 | __cr__ = baml_options.get("client_registry", None) 110 | 111 | return await self.__runtime.build_request( 112 | "RouteRequest", 113 | { 114 | "original_message": original_message, 115 | }, 116 | self.__ctx_manager.get(), 117 | tb, 118 | __cr__, 119 | True, 120 | ) 121 | 122 | async def SummarizeContent( 123 | self, 124 | content: str,content_type: types.ContentType,context: Optional[str], 125 | baml_options: BamlCallOptions = {}, 126 | ) -> baml_py.HTTPRequest: 127 | __tb__ = baml_options.get("tb", None) 128 | if __tb__ is not None: 129 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 130 | else: 131 | tb = None 132 | __cr__ = baml_options.get("client_registry", None) 133 | 134 | return await self.__runtime.build_request( 135 | "SummarizeContent", 136 | { 137 | "content": content, 138 | "content_type": content_type, 139 | "context": context, 140 | }, 141 | self.__ctx_manager.get(), 142 | tb, 143 | __cr__, 144 | True, 145 | ) 146 | 147 | 148 | 149 | __all__ = ["AsyncHttpRequest", "AsyncHttpStreamRequest"] -------------------------------------------------------------------------------- /baml_client/config.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! 
To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from baml_py.logging import set_log_level, get_log_level, set_log_json_mode, set_log_max_chunk_length 17 | from .globals import reset_baml_env_vars 18 | 19 | __all__ = ["set_log_level", "get_log_level", "set_log_json_mode", "reset_baml_env_vars", "set_log_max_chunk_length"] -------------------------------------------------------------------------------- /baml_client/globals.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from __future__ import annotations 17 | import os 18 | 19 | from baml_py import BamlCtxManager, BamlRuntime 20 | from baml_py.baml_py import BamlError 21 | from .inlinedbaml import get_baml_files 22 | from typing_extensions import Literal 23 | from typing import Dict, Any 24 | 25 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME = BamlRuntime.from_files( 26 | "baml_src", 27 | get_baml_files(), 28 | os.environ.copy() 29 | ) 30 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX = BamlCtxManager(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME) 31 | 32 | def reset_baml_env_vars(env_vars: Dict[str, str]): 33 | if DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.allow_reset(): 34 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME.reset( 35 | "baml_src", 36 | get_baml_files(), 37 | env_vars 38 | ) 39 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.reset() 40 | else: 41 | raise BamlError("Cannot reset BAML environment variables while there are active BAML contexts.") 42 | 43 | try: 44 | import dotenv 45 | from unittest.mock import patch 46 | 47 | # Monkeypatch load_dotenv to call reset_baml_env_vars after execution 48 | original_load_dotenv = dotenv.load_dotenv 49 | 50 | def patched_load_dotenv(*args: Any, **kwargs: Any) -> Any: 51 | result = original_load_dotenv(*args, **kwargs) 52 | try: 53 | reset_baml_env_vars(os.environ.copy()) 54 | except BamlError: 55 | # swallow the error 56 | pass 57 | return result 58 | 59 | patch('dotenv.load_dotenv', patched_load_dotenv).start() 60 | except ImportError: 61 | # dotenv is not installed, so we do nothing 62 | pass 63 | 64 | __all__ = [] -------------------------------------------------------------------------------- /baml_client/inlinedbaml.py: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | 17 | file_map = { 18 | 19 | "clients.baml": "client Gemini2_5_flash {\n provider google-ai\n options {\n model gemini-2.5-flash-preview-04-17\n api_key env.GEMINI_API_KEY\n }\n}\n\nclient Gemini2_5_pro {\n provider google-ai\n options {\n model gemini-2.5-pro-exp-03-25\n api_key env.GEMINI_API_KEY\n }\n}\n\nclient DeepSeekR1 {\n provider \"openai\"\n options {\n api_key env.DEEPSEEK_API_KEY\n base_url \"https://api.deepseek.com\"\n model \"deepseek-reasoner\"\n }\n}\n\nclient DeepSeekV3 {\n provider \"openai\"\n options {\n api_key env.DEEPSEEK_API_KEY\n base_url \"https://api.deepseek.com\"\n model \"deepseek-chat\"\n temperature 0.1\n }\n}\n\n// https://docs.boundaryml.com/docs/snippets/clients/fallback\nclient LLMFallback {\n provider fallback\n options {\n // This will try the clients in order until one succeeds\n strategy [DeepSeekV3, Gemini2_5_flash]\n }\n}\n\n// https://docs.boundaryml.com/docs/snippets/clients/retry\nretry_policy Constant {\n max_retries 3\n // Strategy is optional\n strategy {\n type constant_delay\n delay_ms 200\n }\n}\n\nretry_policy Exponential {\n max_retries 2\n // Strategy is optional\n strategy {\n type exponential_backoff\n delay_ms 300\n multiplier 1.5\n max_delay_ms 10000\n }\n}", 20 | "generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. 
You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.88.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n", 21 | "router.baml": "// Define the possible extraction tools\nenum ExtractorTool {\n WebpageExtractor // For general webpages\n PDFExtractor // For PDF documents\n TwitterExtractor // For Twitter/X URLs\n LinkedInExtractor // For LinkedIn post URLs\n Unsupported // For URLs or content types we cannot handle\n YoutubeExtractor // For YouTube video URLs\n}\n\n// Define the router function\n// It takes the original message and decides which tool to use.\nfunction RouteRequest(original_message: string) -> ExtractorTool {\n // Use a capable but fast client for routing\n// client Gemini2_5_flash\n client LLMFallback\n\n prompt #\"\n Analyze the following user message and determine the best tool to use for extracting content from any URL present.\n\n User Message:\n ---\n {{ original_message }}\n ---\n\n Identify the primary URL in the message. 
Based *only* on the URL's structure or file extension, choose one of the following tools:\n\n - If the URL points to a PDF file (ends with .pdf), choose PDFExtractor.\n - If the URL is from Twitter or X (contains twitter.com or x.com), choose TwitterExtractor.\n - If the URL is a LinkedIn post (contains linkedin.com/posts/), choose LinkedInExtractor.\n - If the URL is a YouTube video (contains youtube.com/watch or youtu.be/), choose YoutubeExtractor.\n - For all other standard web URLs (http or https), choose WebpageExtractor.\n - If no URL is found, or the URL type is clearly unsupported (e.g., ftp://, mailto:), choose Unsupported.\n\n Output *only* the name of the chosen tool from the 'ExtractorTool' enum.\n\n {{ ctx.output_format }}\n \"#\n}\n\n// Optional test case\ntest RouteWebpage {\n functions [RouteRequest]\n args {\n original_message #\"Summarize this blog post: https://example.com/blog/article-123\"#\n }\n}\n\ntest RoutePDF {\n functions [RouteRequest]\n args {\n original_message #\"Can you process this PDF? 
https://arxiv.org/pdf/2401.0001.pdf\"#\n }\n}\n\ntest RouteTwitter {\n functions [RouteRequest]\n args {\n original_message #\"Look at this thread: https://x.com/user/status/12345\"#\n }\n}\n\ntest RouteLinkedIn {\n functions [RouteRequest]\n args {\n original_message #\"Interesting thoughts here: https://www.linkedin.com/posts/someuser_activity-1234567890-abcd?utm_source=share\"#\n }\n}\n\ntest RouteNoURL {\n functions [RouteRequest]\n args {\n original_message #\"Tell me a joke.\"#\n }\n}\n\ntest RouteUnsupported {\n functions [RouteRequest]\n args {\n original_message #\"Check this out: ftp://files.example.com/data.zip\"#\n }\n}\n\ntest RouteYouTube {\n functions [RouteRequest]\n args {\n original_message #\"Summarize this video: https://www.youtube.com/watch?v=dQw4w9WgXcQ\"#\n }\n}\n\n", 22 | "summarize.baml": "\n// Define an enum for the type of content being summarized\nenum ContentType {\n Webpage\n PDF\n GenericText\n}\n\n// Define the structure for the summarization output\nclass Summary {\n title string @description(\"A concise and informative title for the summarized content(max 10 words).\")\n key_points string[] @description(\"A list of the most important points or takeaways from the content. (3-5 points)\")\n concise_summary string @description(\"A brief paragraph summarizing the entire content. (50-100 words)\")\n}\n\n// Define the main summarization function\n// This function handles shorter texts directly or uses context for RAG-based summaries.\nfunction SummarizeContent(content: string, content_type: ContentType, context: string?) -> Summary {\n client LLMFallback\n prompt #\"\n You are an expert summarization engine. 
Your goal is to provide a clear and concise summary of the given text.\n\n Content Type: {{ content_type }}\n {% if context %}\n Relevant Context (from RAG):\n ---\n {{ context }}\n ---\n {% endif %}\n\n Original Content:\n ---\n {{ content }}\n ---\n\n Based *only* on the provided Original Content {% if context %}and the Relevant Context{% endif %}, generate the answer.\n\n Format your response strictly as the 'Summary' class structure. Ensure the title, key points, and summary are distinct and accurately reflect the source material. \n Do not include any information not present in the provided text or context.\n\n # Instructions\n - If the link is for a paper, you need to explain what the paper is trying to solve and how, in separate sections: \n '## What is the problem the paper is trying to solve?'\n '## How does the paper attempt to solve the problem?'\n - If it's a blog post or webpage, you have to explain like: 'This post or blog or webpage is about ...'\n - If it's a github repo, you have to explain like: 'This github repo is about ... and tries to solve .... It uses ...'\n - If it's an arXiv or any other paper, do not mention info about DOI or review status or stuff like that. Just mention the main points about the paper.\n \n ----\n {{ ctx.output_format }}\n \"#\n}\n\ntest SummarizeTest {\n functions [SummarizeContent]\n args {\n content #\"\n The Urgency of Interpretability\nApril 2025\nIn the decade that I have been working on AI, I’ve watched it grow from a tiny academic field to arguably the most important economic and geopolitical issue in the world. 
In all that time, perhaps the most important lesson I’ve learned is this: the progress of the underlying technology is inexorable, driven by forces too powerful to stop, but the way in which it happens—the order in which things are built, the applications we choose, and the details of how it is rolled out to society—are eminently possible to change, and it’s possible to have great positive impact by doing so. We can’t stop the bus, but we can steer it. In the past I’ve written about the importance of deploying AI in a way that is positive for the world, and of ensuring that democracies build and wield the technology before autocracies do. Over the last few months, I have become increasingly focused on an additional opportunity for steering the bus: the tantalizing possibility, opened up by some recent advances, that we could succeed at interpretability—that is, in understanding the inner workings of AI systems—before models reach an overwhelming level of power.\n \"#\n content_type #\"Webpage\"#\n }\n}\n", 23 | } 24 | 25 | def get_baml_files(): 26 | return file_map -------------------------------------------------------------------------------- /baml_client/parser.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . 
import types, partial_types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class LlmResponseParser: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | def RouteRequest( 41 | self, 42 | llm_response: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> types.ExtractorTool: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | parsed = self.__runtime.parse_llm_response( 53 | "RouteRequest", 54 | llm_response, 55 | types, 56 | types, 57 | partial_types, 58 | False, 59 | self.__ctx_manager.get(), 60 | tb, 61 | __cr__, 62 | ) 63 | 64 | return cast(types.ExtractorTool, parsed) 65 | 66 | def SummarizeContent( 67 | self, 68 | llm_response: str, 69 | baml_options: BamlCallOptions = {}, 70 | ) -> types.Summary: 71 | __tb__ = baml_options.get("tb", None) 72 | if __tb__ is not None: 73 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 74 | else: 75 | tb = None 76 | __cr__ = baml_options.get("client_registry", None) 77 | 78 | parsed = self.__runtime.parse_llm_response( 79 | "SummarizeContent", 80 | llm_response, 81 | types, 82 | types, 83 | partial_types, 84 | False, 85 | self.__ctx_manager.get(), 86 | tb, 87 | __cr__, 88 | ) 89 | 90 | return cast(types.Summary, parsed) 91 | 92 | 93 | 94 | class LlmStreamParser: 95 | __runtime: baml_py.BamlRuntime 96 | __ctx_manager: baml_py.BamlCtxManager 97 | 98 | def __init__(self, runtime: baml_py.BamlRuntime, 
ctx_manager: baml_py.BamlCtxManager): 99 | self.__runtime = runtime 100 | self.__ctx_manager = ctx_manager 101 | 102 | 103 | def RouteRequest( 104 | self, 105 | llm_response: str, 106 | baml_options: BamlCallOptions = {}, 107 | ) -> Optional[types.ExtractorTool]: 108 | __tb__ = baml_options.get("tb", None) 109 | if __tb__ is not None: 110 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 111 | else: 112 | tb = None 113 | __cr__ = baml_options.get("client_registry", None) 114 | 115 | parsed = self.__runtime.parse_llm_response( 116 | "RouteRequest", 117 | llm_response, 118 | types, 119 | types, 120 | partial_types, 121 | True, 122 | self.__ctx_manager.get(), 123 | tb, 124 | __cr__, 125 | ) 126 | 127 | return cast(Optional[types.ExtractorTool], parsed) 128 | 129 | def SummarizeContent( 130 | self, 131 | llm_response: str, 132 | baml_options: BamlCallOptions = {}, 133 | ) -> partial_types.Summary: 134 | __tb__ = baml_options.get("tb", None) 135 | if __tb__ is not None: 136 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 137 | else: 138 | tb = None 139 | __cr__ = baml_options.get("client_registry", None) 140 | 141 | parsed = self.__runtime.parse_llm_response( 142 | "SummarizeContent", 143 | llm_response, 144 | types, 145 | types, 146 | partial_types, 147 | True, 148 | self.__ctx_manager.get(), 149 | tb, 150 | __cr__, 151 | ) 152 | 153 | return cast(partial_types.Summary, parsed) 154 | 155 | 156 | 157 | __all__ = ["LlmResponseParser", "LlmStreamParser"] -------------------------------------------------------------------------------- /baml_client/partial_types.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! 
To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import baml_py 17 | from enum import Enum 18 | from pydantic import BaseModel, ConfigDict 19 | from typing_extensions import TypeAlias 20 | from typing import Dict, Generic, List, Optional, TypeVar, Union, Literal 21 | 22 | from . import types 23 | from .types import Checked, Check 24 | 25 | ############################################################################### 26 | # 27 | # These types are used for streaming, for when an instance of a type 28 | # is still being built up and any of its fields is not yet fully available. 29 | # 30 | ############################################################################### 31 | 32 | T = TypeVar('T') 33 | class StreamState(BaseModel, Generic[T]): 34 | value: T 35 | state: Literal["Pending", "Incomplete", "Complete"] 36 | 37 | 38 | class Summary(BaseModel): 39 | title: Optional[str] = None 40 | key_points: List[str] 41 | concise_summary: Optional[str] = None 42 | -------------------------------------------------------------------------------- /baml_client/sync_client.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, TypeVar, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | import pprint 19 | 20 | import baml_py 21 | from pydantic import BaseModel, ValidationError, create_model 22 | 23 | from . import partial_types, types 24 | from .types import Checked, Check 25 | from .type_builder import TypeBuilder 26 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 27 | from .sync_request import HttpRequest, HttpStreamRequest 28 | from .parser import LlmResponseParser, LlmStreamParser 29 | 30 | OutputType = TypeVar('OutputType') 31 | 32 | 33 | # Define the TypedDict with optional parameters having default values 34 | class BamlCallOptions(TypedDict, total=False): 35 | tb: NotRequired[TypeBuilder] 36 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 37 | collector: NotRequired[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] 38 | 39 | 40 | class BamlSyncClient: 41 | __runtime: baml_py.BamlRuntime 42 | __ctx_manager: baml_py.BamlCtxManager 43 | __stream_client: "BamlStreamClient" 44 | __http_request: "HttpRequest" 45 | __http_stream_request: "HttpStreamRequest" 46 | __llm_response_parser: LlmResponseParser 47 | __baml_options: BamlCallOptions 48 | 49 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 50 | self.__runtime = runtime 51 | self.__ctx_manager = ctx_manager 52 | self.__stream_client = BamlStreamClient(self.__runtime, self.__ctx_manager, baml_options) 53 | self.__http_request = HttpRequest(self.__runtime, self.__ctx_manager) 54 | self.__http_stream_request = HttpStreamRequest(self.__runtime, self.__ctx_manager) 55 | self.__llm_response_parser = 
LlmResponseParser(self.__runtime, self.__ctx_manager) 56 | self.__llm_stream_parser = LlmStreamParser(self.__runtime, self.__ctx_manager) 57 | self.__baml_options = baml_options or {} 58 | 59 | @property 60 | def stream(self): 61 | return self.__stream_client 62 | 63 | @property 64 | def request(self): 65 | return self.__http_request 66 | 67 | @property 68 | def stream_request(self): 69 | return self.__http_stream_request 70 | 71 | @property 72 | def parse(self): 73 | return self.__llm_response_parser 74 | 75 | @property 76 | def parse_stream(self): 77 | return self.__llm_stream_parser 78 | 79 | def with_options( 80 | self, 81 | tb: Optional[TypeBuilder] = None, 82 | client_registry: Optional[baml_py.baml_py.ClientRegistry] = None, 83 | collector: Optional[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] = None, 84 | ) -> "BamlSyncClient": 85 | """ 86 | Returns a new instance of BamlSyncClient with explicitly typed baml options 87 | for Python 3.8 compatibility. 88 | """ 89 | new_options: BamlCallOptions = self.__baml_options.copy() 90 | 91 | # Override if any keyword arguments were provided. 
92 | if tb is not None: 93 | new_options["tb"] = tb 94 | if client_registry is not None: 95 | new_options["client_registry"] = client_registry 96 | if collector is not None: 97 | new_options["collector"] = collector 98 | return BamlSyncClient(self.__runtime, self.__ctx_manager, new_options) 99 | 100 | 101 | def RouteRequest( 102 | self, 103 | original_message: str, 104 | baml_options: BamlCallOptions = {}, 105 | ) -> types.ExtractorTool: 106 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 107 | __tb__ = options.get("tb", None) 108 | if __tb__ is not None: 109 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 110 | else: 111 | tb = None 112 | __cr__ = options.get("client_registry", None) 113 | collector = options.get("collector", None) 114 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 115 | 116 | raw = self.__runtime.call_function_sync( 117 | "RouteRequest", 118 | { 119 | "original_message": original_message, 120 | }, 121 | self.__ctx_manager.get(), 122 | tb, 123 | __cr__, 124 | collectors, 125 | ) 126 | return cast(types.ExtractorTool, raw.cast_to(types, types, partial_types, False)) 127 | 128 | def SummarizeContent( 129 | self, 130 | content: str,content_type: types.ContentType,context: Optional[str], 131 | baml_options: BamlCallOptions = {}, 132 | ) -> types.Summary: 133 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 134 | __tb__ = options.get("tb", None) 135 | if __tb__ is not None: 136 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 137 | else: 138 | tb = None 139 | __cr__ = options.get("client_registry", None) 140 | collector = options.get("collector", None) 141 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 142 | 143 | raw = self.__runtime.call_function_sync( 144 | "SummarizeContent", 145 | { 146 | "content": 
content,"content_type": content_type,"context": context, 147 | }, 148 | self.__ctx_manager.get(), 149 | tb, 150 | __cr__, 151 | collectors, 152 | ) 153 | return cast(types.Summary, raw.cast_to(types, types, partial_types, False)) 154 | 155 | 156 | 157 | 158 | class BamlStreamClient: 159 | __runtime: baml_py.BamlRuntime 160 | __ctx_manager: baml_py.BamlCtxManager 161 | __baml_options: BamlCallOptions 162 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 163 | self.__runtime = runtime 164 | self.__ctx_manager = ctx_manager 165 | self.__baml_options = baml_options or {} 166 | 167 | 168 | def RouteRequest( 169 | self, 170 | original_message: str, 171 | baml_options: BamlCallOptions = {}, 172 | ) -> baml_py.BamlSyncStream[Optional[types.ExtractorTool], types.ExtractorTool]: 173 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 174 | __tb__ = options.get("tb", None) 175 | if __tb__ is not None: 176 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 177 | else: 178 | tb = None 179 | __cr__ = options.get("client_registry", None) 180 | collector = options.get("collector", None) 181 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 182 | 183 | raw = self.__runtime.stream_function_sync( 184 | "RouteRequest", 185 | { 186 | "original_message": original_message, 187 | }, 188 | None, 189 | self.__ctx_manager.get(), 190 | tb, 191 | __cr__, 192 | collectors, 193 | ) 194 | 195 | return baml_py.BamlSyncStream[Optional[types.ExtractorTool], types.ExtractorTool]( 196 | raw, 197 | lambda x: cast(Optional[types.ExtractorTool], x.cast_to(types, types, partial_types, True)), 198 | lambda x: cast(types.ExtractorTool, x.cast_to(types, types, partial_types, False)), 199 | self.__ctx_manager.get(), 200 | ) 201 | 202 | def SummarizeContent( 203 | self, 204 | content: str,content_type: 
types.ContentType,context: Optional[str], 205 | baml_options: BamlCallOptions = {}, 206 | ) -> baml_py.BamlSyncStream[partial_types.Summary, types.Summary]: 207 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 208 | __tb__ = options.get("tb", None) 209 | if __tb__ is not None: 210 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 211 | else: 212 | tb = None 213 | __cr__ = options.get("client_registry", None) 214 | collector = options.get("collector", None) 215 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 216 | 217 | raw = self.__runtime.stream_function_sync( 218 | "SummarizeContent", 219 | { 220 | "content": content, 221 | "content_type": content_type, 222 | "context": context, 223 | }, 224 | None, 225 | self.__ctx_manager.get(), 226 | tb, 227 | __cr__, 228 | collectors, 229 | ) 230 | 231 | return baml_py.BamlSyncStream[partial_types.Summary, types.Summary]( 232 | raw, 233 | lambda x: cast(partial_types.Summary, x.cast_to(types, types, partial_types, True)), 234 | lambda x: cast(types.Summary, x.cast_to(types, types, partial_types, False)), 235 | self.__ctx_manager.get(), 236 | ) 237 | 238 | 239 | 240 | b = BamlSyncClient(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX) 241 | 242 | __all__ = ["b"] -------------------------------------------------------------------------------- /baml_client/sync_request.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. 
Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . import types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class HttpRequest: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | def RouteRequest( 41 | self, 42 | original_message: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> baml_py.HTTPRequest: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | return self.__runtime.build_request_sync( 53 | "RouteRequest", 54 | { 55 | "original_message": original_message, 56 | }, 57 | self.__ctx_manager.get(), 58 | tb, 59 | __cr__, 60 | False, 61 | ) 62 | 63 | def SummarizeContent( 64 | self, 65 | content: str,content_type: types.ContentType,context: Optional[str], 66 | baml_options: BamlCallOptions = {}, 67 | ) -> baml_py.HTTPRequest: 68 | __tb__ = baml_options.get("tb", None) 69 | if __tb__ is not None: 70 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 71 | else: 72 | tb = None 73 | __cr__ = baml_options.get("client_registry", None) 74 | 75 | return self.__runtime.build_request_sync( 76 | "SummarizeContent", 77 | { 78 | 
"content": content,"content_type": content_type,"context": context, 79 | }, 80 | self.__ctx_manager.get(), 81 | tb, 82 | __cr__, 83 | False, 84 | ) 85 | 86 | 87 | 88 | class HttpStreamRequest: 89 | __runtime: baml_py.BamlRuntime 90 | __ctx_manager: baml_py.BamlCtxManager 91 | 92 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 93 | self.__runtime = runtime 94 | self.__ctx_manager = ctx_manager 95 | 96 | 97 | def RouteRequest( 98 | self, 99 | original_message: str, 100 | baml_options: BamlCallOptions = {}, 101 | ) -> baml_py.HTTPRequest: 102 | __tb__ = baml_options.get("tb", None) 103 | if __tb__ is not None: 104 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 105 | else: 106 | tb = None 107 | __cr__ = baml_options.get("client_registry", None) 108 | 109 | return self.__runtime.build_request_sync( 110 | "RouteRequest", 111 | { 112 | "original_message": original_message, 113 | }, 114 | self.__ctx_manager.get(), 115 | tb, 116 | __cr__, 117 | True, 118 | ) 119 | 120 | def SummarizeContent( 121 | self, 122 | content: str,content_type: types.ContentType,context: Optional[str], 123 | baml_options: BamlCallOptions = {}, 124 | ) -> baml_py.HTTPRequest: 125 | __tb__ = baml_options.get("tb", None) 126 | if __tb__ is not None: 127 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 128 | else: 129 | tb = None 130 | __cr__ = baml_options.get("client_registry", None) 131 | 132 | return self.__runtime.build_request_sync( 133 | "SummarizeContent", 134 | { 135 | "content": content,"content_type": content_type,"context": context, 136 | }, 137 | self.__ctx_manager.get(), 138 | tb, 139 | __cr__, 140 | True, 141 | ) 142 | 143 | 144 | 145 | __all__ = ["HttpRequest", "HttpStreamRequest"] -------------------------------------------------------------------------------- /baml_client/tracing.py: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX 17 | 18 | trace = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.trace_fn 19 | set_tags = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.upsert_tags 20 | def flush(): 21 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.flush() 22 | on_log_event = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.on_log_event 23 | 24 | 25 | __all__ = ['trace', 'set_tags', "flush", "on_log_event"] -------------------------------------------------------------------------------- /baml_client/type_builder.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import typing 17 | from baml_py.baml_py import FieldType, EnumValueBuilder, EnumBuilder, ClassBuilder 18 | from baml_py.type_builder import TypeBuilder as _TypeBuilder, ClassPropertyBuilder, ClassPropertyViewer, EnumValueViewer 19 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 20 | 21 | 22 | class TypeBuilder(_TypeBuilder): 23 | def __init__(self): 24 | super().__init__(classes=set( 25 | ["Summary",] 26 | ), enums=set( 27 | ["ContentType","ExtractorTool",] 28 | ), runtime=DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME) 29 | 30 | 31 | @property 32 | def Summary(self) -> "SummaryAst": 33 | return SummaryAst(self) 34 | 35 | 36 | 37 | 38 | 39 | class SummaryAst: 40 | def __init__(self, tb: _TypeBuilder): 41 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 42 | self._bldr = _tb.class_("Summary") 43 | self._properties: typing.Set[str] = set([ "title", "key_points", "concise_summary", ]) 44 | self._props = SummaryProperties(self._bldr, self._properties) 45 | 46 | def type(self) -> FieldType: 47 | return self._bldr.field() 48 | 49 | @property 50 | def props(self) -> "SummaryProperties": 51 | return self._props 52 | 53 | 54 | class SummaryViewer(SummaryAst): 55 | def __init__(self, tb: _TypeBuilder): 56 | super().__init__(tb) 57 | 58 | 59 | def list_properties(self) -> typing.List[typing.Tuple[str, ClassPropertyViewer]]: 60 | return [(name, ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] 61 | 62 | 63 | 64 | class SummaryProperties: 65 | def __init__(self, bldr: ClassBuilder, properties: typing.Set[str]): 66 | self.__bldr = bldr 67 | self.__properties = properties 68 | 69 | 70 | 71 | @property 72 | def title(self) -> ClassPropertyViewer: 73 | return ClassPropertyViewer(self.__bldr.property("title")) 74 | 75 | @property 76 | def 
key_points(self) -> ClassPropertyViewer: 77 | return ClassPropertyViewer(self.__bldr.property("key_points")) 78 | 79 | @property 80 | def concise_summary(self) -> ClassPropertyViewer: 81 | return ClassPropertyViewer(self.__bldr.property("concise_summary")) 82 | 83 | 84 | 85 | 86 | 87 | class ContentTypeAst: 88 | def __init__(self, tb: _TypeBuilder): 89 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 90 | self._bldr = _tb.enum("ContentType") 91 | self._values: typing.Set[str] = set([ "Webpage", "PDF", "GenericText", ]) 92 | self._vals = ContentTypeValues(self._bldr, self._values) 93 | 94 | def type(self) -> FieldType: 95 | return self._bldr.field() 96 | 97 | @property 98 | def values(self) -> "ContentTypeValues": 99 | return self._vals 100 | 101 | 102 | class ContentTypeViewer(ContentTypeAst): 103 | def __init__(self, tb: _TypeBuilder): 104 | super().__init__(tb) 105 | 106 | def list_values(self) -> typing.List[typing.Tuple[str, EnumValueViewer]]: 107 | return [(name, EnumValueViewer(self._bldr.value(name))) for name in self._values] 108 | 109 | 110 | class ContentTypeValues: 111 | def __init__(self, enum_bldr: EnumBuilder, values: typing.Set[str]): 112 | self.__bldr = enum_bldr 113 | self.__values = values 114 | 115 | 116 | 117 | @property 118 | def Webpage(self) -> EnumValueViewer: 119 | return EnumValueViewer(self.__bldr.value("Webpage")) 120 | 121 | 122 | @property 123 | def PDF(self) -> EnumValueViewer: 124 | return EnumValueViewer(self.__bldr.value("PDF")) 125 | 126 | 127 | @property 128 | def GenericText(self) -> EnumValueViewer: 129 | return EnumValueViewer(self.__bldr.value("GenericText")) 130 | 131 | 132 | 133 | 134 | class ExtractorToolAst: 135 | def __init__(self, tb: _TypeBuilder): 136 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 137 | self._bldr = _tb.enum("ExtractorTool") 138 | self._values: typing.Set[str] = set([ "WebpageExtractor", "PDFExtractor", "TwitterExtractor", "LinkedInExtractor", 
"Unsupported", "YoutubeExtractor", ]) 139 | self._vals = ExtractorToolValues(self._bldr, self._values) 140 | 141 | def type(self) -> FieldType: 142 | return self._bldr.field() 143 | 144 | @property 145 | def values(self) -> "ExtractorToolValues": 146 | return self._vals 147 | 148 | 149 | class ExtractorToolViewer(ExtractorToolAst): 150 | def __init__(self, tb: _TypeBuilder): 151 | super().__init__(tb) 152 | 153 | def list_values(self) -> typing.List[typing.Tuple[str, EnumValueViewer]]: 154 | return [(name, EnumValueViewer(self._bldr.value(name))) for name in self._values] 155 | 156 | 157 | class ExtractorToolValues: 158 | def __init__(self, enum_bldr: EnumBuilder, values: typing.Set[str]): 159 | self.__bldr = enum_bldr 160 | self.__values = values 161 | 162 | 163 | 164 | @property 165 | def WebpageExtractor(self) -> EnumValueViewer: 166 | return EnumValueViewer(self.__bldr.value("WebpageExtractor")) 167 | 168 | 169 | @property 170 | def PDFExtractor(self) -> EnumValueViewer: 171 | return EnumValueViewer(self.__bldr.value("PDFExtractor")) 172 | 173 | 174 | @property 175 | def TwitterExtractor(self) -> EnumValueViewer: 176 | return EnumValueViewer(self.__bldr.value("TwitterExtractor")) 177 | 178 | 179 | @property 180 | def LinkedInExtractor(self) -> EnumValueViewer: 181 | return EnumValueViewer(self.__bldr.value("LinkedInExtractor")) 182 | 183 | 184 | @property 185 | def Unsupported(self) -> EnumValueViewer: 186 | return EnumValueViewer(self.__bldr.value("Unsupported")) 187 | 188 | 189 | @property 190 | def YoutubeExtractor(self) -> EnumValueViewer: 191 | return EnumValueViewer(self.__bldr.value("YoutubeExtractor")) 192 | 193 | 194 | 195 | 196 | 197 | __all__ = ["TypeBuilder"] -------------------------------------------------------------------------------- /baml_client/types.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to 
Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import baml_py 17 | from enum import Enum 18 | from pydantic import BaseModel, ConfigDict 19 | from typing_extensions import TypeAlias 20 | from typing import Dict, Generic, List, Literal, Optional, TypeVar, Union 21 | 22 | 23 | T = TypeVar('T') 24 | CheckName = TypeVar('CheckName', bound=str) 25 | 26 | class Check(BaseModel): 27 | name: str 28 | expression: str 29 | status: str 30 | 31 | class Checked(BaseModel, Generic[T,CheckName]): 32 | value: T 33 | checks: Dict[CheckName, Check] 34 | 35 | def get_checks(checks: Dict[CheckName, Check]) -> List[Check]: 36 | return list(checks.values()) 37 | 38 | def all_succeeded(checks: Dict[CheckName, Check]) -> bool: 39 | return all(check.status == "succeeded" for check in get_checks(checks)) 40 | 41 | 42 | 43 | class ContentType(str, Enum): 44 | 45 | Webpage = "Webpage" 46 | PDF = "PDF" 47 | GenericText = "GenericText" 48 | 49 | class ExtractorTool(str, Enum): 50 | 51 | WebpageExtractor = "WebpageExtractor" 52 | PDFExtractor = "PDFExtractor" 53 | TwitterExtractor = "TwitterExtractor" 54 | LinkedInExtractor = "LinkedInExtractor" 55 | Unsupported = "Unsupported" 56 | YoutubeExtractor = "YoutubeExtractor" 57 | 58 | class Summary(BaseModel): 59 | title: str 60 | key_points: List[str] 61 | concise_summary: str 62 | -------------------------------------------------------------------------------- /baml_src/clients.baml: -------------------------------------------------------------------------------- 1 | client Gemini2_5_flash { 2 | provider google-ai 3 | options { 4 | model 
gemini-2.5-flash-preview-04-17 5 | api_key env.GEMINI_API_KEY 6 | } 7 | } 8 | 9 | client Gemini2_5_pro { 10 | provider google-ai 11 | options { 12 | model gemini-2.5-pro-exp-03-25 13 | api_key env.GEMINI_API_KEY 14 | } 15 | } 16 | 17 | client DeepSeekR1 { 18 | provider "openai" 19 | options { 20 | api_key env.DEEPSEEK_API_KEY 21 | base_url "https://api.deepseek.com" 22 | model "deepseek-reasoner" 23 | } 24 | } 25 | 26 | client DeepSeekV3 { 27 | provider "openai" 28 | options { 29 | api_key env.DEEPSEEK_API_KEY 30 | base_url "https://api.deepseek.com" 31 | model "deepseek-chat" 32 | temperature 0.1 33 | } 34 | } 35 | 36 | // https://docs.boundaryml.com/docs/snippets/clients/fallback 37 | client LLMFallback { 38 | provider fallback 39 | options { 40 | // This will try the clients in order until one succeeds 41 | strategy [DeepSeekV3, Gemini2_5_flash] 42 | } 43 | } 44 | 45 | // https://docs.boundaryml.com/docs/snippets/clients/retry 46 | retry_policy Constant { 47 | max_retries 3 48 | // Strategy is optional 49 | strategy { 50 | type constant_delay 51 | delay_ms 200 52 | } 53 | } 54 | 55 | retry_policy Exponential { 56 | max_retries 2 57 | // Strategy is optional 58 | strategy { 59 | type exponential_backoff 60 | delay_ms 300 61 | multiplier 1.5 62 | max_delay_ms 10000 63 | } 64 | } -------------------------------------------------------------------------------- /baml_src/generators.baml: -------------------------------------------------------------------------------- 1 | // This helps use auto generate libraries you can use in the language of 2 | // your choice. You can have multiple generators if you use multiple languages. 3 | // Just ensure that the output_dir is different for each generator. 
4 | generator target { 5 | // Valid values: "python/pydantic", "typescript", "ruby/sorbet", "rest/openapi" 6 | output_type "python/pydantic" 7 | 8 | // Where the generated code will be saved (relative to baml_src/) 9 | output_dir "../" 10 | 11 | // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml). 12 | // The BAML VSCode extension version should also match this version. 13 | version "0.88.0" 14 | 15 | // Valid values: "sync", "async" 16 | // This controls what `b.FunctionName()` will be (sync or async). 17 | default_client_mode sync 18 | } 19 | -------------------------------------------------------------------------------- /baml_src/router.baml: -------------------------------------------------------------------------------- 1 | // Define the possible extraction tools 2 | enum ExtractorTool { 3 | WebpageExtractor // For general webpages 4 | PDFExtractor // For PDF documents 5 | TwitterExtractor // For Twitter/X URLs 6 | LinkedInExtractor // For LinkedIn post URLs 7 | Unsupported // For URLs or content types we cannot handle 8 | YoutubeExtractor // For YouTube video URLs 9 | } 10 | 11 | // Define the router function 12 | // It takes the original message and decides which tool to use. 13 | function RouteRequest(original_message: string) -> ExtractorTool { 14 | // Use a capable but fast client for routing 15 | // client Gemini2_5_flash 16 | client LLMFallback 17 | 18 | prompt #" 19 | Analyze the following user message and determine the best tool to use for extracting content from any URL present. 20 | 21 | User Message: 22 | --- 23 | {{ original_message }} 24 | --- 25 | 26 | Identify the primary URL in the message. Based *only* on the URL's structure or file extension, choose one of the following tools: 27 | 28 | - If the URL points to a PDF file (ends with .pdf), choose PDFExtractor. 29 | - If the URL is from Twitter or X (contains twitter.com or x.com), choose TwitterExtractor. 
30 | - If the URL is a LinkedIn post (contains linkedin.com/posts/), choose LinkedInExtractor. 31 | - If the URL is a YouTube video (contains youtube.com/watch or youtu.be/), choose YoutubeExtractor. 32 | - For all other standard web URLs (http or https), choose WebpageExtractor. 33 | - If no URL is found, or the URL type is clearly unsupported (e.g., ftp://, mailto:), choose Unsupported. 34 | 35 | Output *only* the name of the chosen tool from the 'ExtractorTool' enum. 36 | 37 | {{ ctx.output_format }} 38 | "# 39 | } 40 | 41 | // Optional test case 42 | test RouteWebpage { 43 | functions [RouteRequest] 44 | args { 45 | original_message #"Summarize this blog post: https://example.com/blog/article-123"# 46 | } 47 | } 48 | 49 | test RoutePDF { 50 | functions [RouteRequest] 51 | args { 52 | original_message #"Can you process this PDF? https://arxiv.org/pdf/2401.0001.pdf"# 53 | } 54 | } 55 | 56 | test RouteTwitter { 57 | functions [RouteRequest] 58 | args { 59 | original_message #"Look at this thread: https://x.com/user/status/12345"# 60 | } 61 | } 62 | 63 | test RouteLinkedIn { 64 | functions [RouteRequest] 65 | args { 66 | original_message #"Interesting thoughts here: https://www.linkedin.com/posts/someuser_activity-1234567890-abcd?utm_source=share"# 67 | } 68 | } 69 | 70 | test RouteNoURL { 71 | functions [RouteRequest] 72 | args { 73 | original_message #"Tell me a joke."# 74 | } 75 | } 76 | 77 | test RouteUnsupported { 78 | functions [RouteRequest] 79 | args { 80 | original_message #"Check this out: ftp://files.example.com/data.zip"# 81 | } 82 | } 83 | 84 | test RouteYouTube { 85 | functions [RouteRequest] 86 | args { 87 | original_message #"Summarize this video: https://www.youtube.com/watch?v=dQw4w9WgXcQ"# 88 | } 89 | } 90 | 91 | -------------------------------------------------------------------------------- /baml_src/summarize.baml: -------------------------------------------------------------------------------- 1 | 2 | // Define an enum for the type of content 
being summarized 3 | enum ContentType { 4 | Webpage 5 | PDF 6 | GenericText 7 | } 8 | 9 | // Define the structure for the summarization output 10 | class Summary { 11 | title string @description("A concise and informative title for the summarized content (max 10 words).") 12 | key_points string[] @description("A list of the most important points or takeaways from the content. (3-5 points)") 13 | concise_summary string @description("A brief paragraph summarizing the entire content. (50-100 words)") 14 | } 15 | 16 | // Define the main summarization function 17 | // This function handles shorter texts directly or uses context for RAG-based summaries. 18 | function SummarizeContent(content: string, content_type: ContentType, context: string?) -> Summary { 19 | client LLMFallback 20 | prompt #" 21 | You are an expert summarization engine. Your goal is to provide a clear and concise summary of the given text. 22 | 23 | Content Type: {{ content_type }} 24 | {% if context %} 25 | Relevant Context (from RAG): 26 | --- 27 | {{ context }} 28 | --- 29 | {% endif %} 30 | 31 | Original Content: 32 | --- 33 | {{ content }} 34 | --- 35 | 36 | Based *only* on the provided Original Content {% if context %}and the Relevant Context{% endif %}, generate the answer. 37 | 38 | Format your response strictly as the 'Summary' class structure. Ensure the title, key points, and summary are distinct and accurately reflect the source material. 39 | Do not include any information not present in the provided text or context. 40 | 41 | # Instructions 42 | - If the link is for a paper, you need to explain what the paper is trying to solve and how, in separate sections: 43 | '## What is the problem the paper is trying to solve?' 44 | '## How does the paper attempt to solve the problem?' 45 | - If it's a blog post or webpage, you have to explain like: 'This post or blog or webpage is about ...' 46 | - If it's a github repo, you have to explain like: 'This github repo is about ...
and tries to solve .... It uses ...' 47 | - If it's an arXiv or any other paper, do not mention info about DOI or review status or stuff like that. Just mention the main points about the paper. 48 | 49 | ---- 50 | {{ ctx.output_format}} 51 | "# 52 | } 53 | 54 | test SummarizeTest { 55 | functions [SummarizeContent] 56 | args { 57 | content #" 58 | The Urgency of Interpretability 59 | April 2025 60 | In the decade that I have been working on AI, I’ve watched it grow from a tiny academic field to arguably the most important economic and geopolitical issue in the world. In all that time, perhaps the most important lesson I’ve learned is this: the progress of the underlying technology is inexorable, driven by forces too powerful to stop, but the way in which it happens—the order in which things are built, the applications we choose, and the details of how it is rolled out to society—are eminently possible to change, and it’s possible to have great positive impact by doing so. We can’t stop the bus, but we can steer it. In the past I’ve written about the importance of deploying AI in a way that is positive for the world, and of ensuring that democracies build and wield the technology before autocracies do. Over the last few months, I have become increasingly focused on an additional opportunity for steering the bus: the tantalizing possibility, opened up by some recent advances, that we could succeed at interpretability—that is, in understanding the inner workings of AI systems—before models reach an overwhelming level of power.
61 | "# 62 | content_type #"Webpage"# 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from contextlib import asynccontextmanager 4 | from dotenv import load_dotenv 5 | import asyncio 6 | import re 7 | import html # <-- Add this import 8 | import json 9 | 10 | from fastapi import FastAPI, Request, Response, HTTPException, Header, APIRouter 11 | import uvicorn 12 | 13 | from telegram import Update 14 | from telegram.ext import Application, MessageHandler, filters, ContextTypes 15 | from telegram.constants import ParseMode 16 | 17 | # Import the agent runner 18 | from agent import run_agent 19 | 20 | # Load environment variables from .env file 21 | load_dotenv(override=True) 22 | 23 | # --- Logging Setup --- 24 | logging.basicConfig( 25 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO 26 | ) 27 | logging.getLogger("httpx").setLevel(logging.WARNING) 28 | logger = logging.getLogger(__name__) 29 | 30 | # --- Constants --- 31 | # Simple regex to find the first URL in a message 32 | URL_REGEX = r"(https?:\/\/[^\s]+)" 33 | 34 | # --- Environment Variables & Constants --- 35 | BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN") 36 | # For Cloud Run, we should use the service URL as the webhook URL 37 | # if not explicitly set through WEBHOOK_URL 38 | WEBHOOK_URL = os.getenv("WEBHOOK_URL") 39 | WEBHOOK_SECRET_PATH = os.getenv("WEBHOOK_SECRET_PATH", "webhook") 40 | 41 | # If we're in Cloud Run, we'll see these environment variables 42 | CLOUD_RUN_SERVICE_URL = os.getenv("K_SERVICE") # Will be set in Cloud Run 43 | 44 | if not BOT_TOKEN: 45 | logger.critical("TELEGRAM_BOT_TOKEN missing. 
Bot cannot start.") 46 | exit() 47 | 48 | # If we're in Cloud Run but no WEBHOOK_URL is set, use inference 49 | if CLOUD_RUN_SERVICE_URL and not WEBHOOK_URL: 50 | WEBHOOK_URL = f"https://{os.getenv('K_SERVICE')}-{os.getenv('K_REVISION', 'latest')}.{os.getenv('K_REGION', 'unknown')}.run.app" 51 | logger.info(f"Running in Cloud Run, inferred WEBHOOK_URL: {WEBHOOK_URL}") 52 | 53 | if not WEBHOOK_URL: 54 | logger.warning( 55 | "WEBHOOK_URL missing. Webhook setup will be skipped (local testing?)." 56 | ) 57 | 58 | 59 | # --- Global Application Object --- 60 | ptb_app = Application.builder().token(BOT_TOKEN).build() 61 | 62 | 63 | # --- Message Handler --- 64 | async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: 65 | message = update.effective_message 66 | text = message.text 67 | chat_id = message.chat_id 68 | logger.info(f"Received message in chat {chat_id}: {text}") 69 | 70 | # Simple check for URL 71 | if not any(url in text for url in ["http://", "https://"]): 72 | logger.info("Message does not contain a URL, ignoring.") 73 | return 74 | 75 | # Extract the first URL 76 | url_match = re.search(URL_REGEX, text) 77 | if not url_match: 78 | logger.info("No URL found in the message despite initial check. Ignoring.") 79 | return 80 | 81 | extracted_url = url_match.group(0) 82 | logger.info(f"Extracted URL: {extracted_url}") 83 | 84 | try: 85 | # Run the agent 86 | agent_result = await run_agent(text) 87 | 88 | # --- Process Agent Result --- 89 | MAX_LEN = 4096 # Max Telegram message length 90 | 91 | # Agent returns string (summary or error) or None. 92 | # Only proceed if we got a valid summary string (not starting with "Error:"). 93 | if isinstance(agent_result, str) and not agent_result.startswith("Error:"): 94 | logger.info( 95 | f"Agent returned valid summary (len {len(agent_result)} chars). Preparing message." 
96 | ) 97 | 98 | # Use agent result directly as the raw text to send (URL removed) 99 | text_to_send_raw = agent_result 100 | 101 | # Escape HTML characters for summary part to prevent parsing errors 102 | text_to_send_formatted = html.escape(agent_result) 103 | 104 | # Send text in chunks if too long 105 | for i in range(0, len(text_to_send_formatted), MAX_LEN): 106 | chunk = text_to_send_formatted[i : i + MAX_LEN] 107 | try: 108 | # Use HTML parse mode as we escaped the summary 109 | # Reply to the original message instead of just sending 110 | await message.reply_text(chunk, parse_mode=ParseMode.HTML) 111 | logger.info(f"Sent chunk {i // MAX_LEN + 1} successfully.") 112 | except Exception as send_err: 113 | logger.error( 114 | f"Failed to send chunk with HTML formatting: {send_err}. Trying plain text." 115 | ) 116 | # Fallback to sending raw chunk without formatting if HTML fails 117 | raw_chunk = text_to_send_raw[i : i + MAX_LEN] 118 | try: 119 | # Reply to the original message instead of just sending 120 | await message.reply_text(raw_chunk) 121 | logger.info( 122 | f"Sent chunk {i // MAX_LEN + 1} successfully (plain text fallback)." 123 | ) 124 | except Exception as plain_send_err: 125 | logger.error( 126 | f"Failed to send chunk even as plain text: {plain_send_err}" 127 | ) 128 | # Stop sending chunks if even plain text fails for one 129 | break 130 | 131 | if i + MAX_LEN < len(text_to_send_formatted): 132 | await asyncio.sleep(0.5) # Small delay between chunks 133 | 134 | # --- Silent Failure Cases --- 135 | elif isinstance(agent_result, str) and agent_result.startswith("Error:"): 136 | # Agent returned an error string 137 | logger.error( 138 | f"Agent failed for {extracted_url}. Error: {agent_result}. Not replying." 139 | ) 140 | # Do nothing in the chat 141 | 142 | else: 143 | # Agent returned None or unexpected type 144 | if agent_result is None: 145 | logger.error(f"Agent returned None for {extracted_url}. 
Not replying.") 146 | else: 147 | logger.error( 148 | f"Agent returned unexpected result type for {extracted_url}: {type(agent_result)}. Not replying." 149 | ) 150 | # Do nothing in the chat 151 | 152 | except Exception as e: 153 | # --- Main Execution Error --- 154 | # Log the error but do not send anything to the user 155 | logger.error( 156 | f"Unhandled exception processing message for URL {extracted_url}: {e}", 157 | exc_info=True, 158 | ) 159 | # Removed user-facing error reporting 160 | 161 | # Removed the finally block as the thinking_message is gone 162 | 163 | 164 | # --- FastAPI Lifespan Management (Setup/Teardown) --- 165 | @asynccontextmanager 166 | async def lifespan(app: FastAPI): 167 | # --- Startup --- 168 | logger.info("Application startup...") 169 | global ptb_app # Make sure we're modifying the global instance 170 | 171 | should_use_polling = os.getenv("USE_POLLING", "false").lower() == "true" 172 | 173 | logger.info("Initializing PTB application...") 174 | await ptb_app.initialize() 175 | url_handler = MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message) 176 | ptb_app.add_handler(url_handler) 177 | await ptb_app.start() # Start application components (like scheduler, etc.) 178 | 179 | polling_task = None 180 | if should_use_polling: 181 | logger.info( 182 | "Polling mode is active. Starting PTB polling loop in background..." 
183 | ) 184 | # Start polling in a background task so it doesn't block Uvicorn 185 | polling_task = asyncio.create_task( 186 | ptb_app.updater.start_polling(poll_interval=1.0) 187 | ) 188 | logger.info("PTB polling loop started.") 189 | 190 | elif WEBHOOK_URL: # Webhook mode 191 | full_webhook_url = ( 192 | f"{WEBHOOK_URL.rstrip('/')}/{WEBHOOK_SECRET_PATH.lstrip('/')}" 193 | ) 194 | logger.info(f"Setting webhook to: {full_webhook_url}") 195 | try: 196 | # ptb_app.start() should have registered the webhook if configured 197 | # Forcing it here to be sure, especially if start() behavior changes 198 | await ptb_app.bot.set_webhook( 199 | url=full_webhook_url, 200 | secret_token=os.getenv("TELEGRAM_WEBHOOK_SECRET_TOKEN"), 201 | allowed_updates=Update.ALL_TYPES, 202 | ) 203 | logger.info("Webhook explicitly set successfully.") 204 | except Exception as e: 205 | logger.error(f"Failed to set webhook: {e}", exc_info=True) 206 | else: # No polling and no WEBHOOK_URL 207 | logger.warning( 208 | "USE_POLLING is false and WEBHOOK_URL not set. Bot may not receive updates." 
209 | ) 210 | 211 | app.state.bot_initialized = True 212 | logger.info("Bot initialization complete.") 213 | 214 | yield 215 | 216 | # --- Shutdown --- 217 | logger.info("Application shutdown...") 218 | try: 219 | if polling_task and not polling_task.done(): 220 | logger.info("Polling mode: Stopping PTB polling loop...") 221 | ptb_app.updater.stop() # Request stop 222 | try: 223 | await asyncio.wait_for( 224 | polling_task, timeout=5.0 225 | ) # Wait for task to finish 226 | except asyncio.TimeoutError: 227 | logger.warning("Polling task did not finish in time, cancelling.") 228 | polling_task.cancel() 229 | except Exception as e: 230 | logger.error(f"Error stopping polling task: {e}") 231 | logger.info("PTB polling loop stopped.") 232 | elif ( 233 | WEBHOOK_URL and not should_use_polling 234 | ): # only delete webhook if it was set 235 | logger.info("Webhook mode: Attempting to delete webhook...") 236 | try: 237 | await ptb_app.bot.delete_webhook(drop_pending_updates=True) 238 | logger.info("Webhook deleted successfully.") 239 | except Exception as e: 240 | logger.error(f"Failed to delete webhook: {e}", exc_info=True) 241 | 242 | if ptb_app.running: 243 | await ptb_app.stop() 244 | await ptb_app.shutdown() 245 | logger.info("PTB Application components stopped and shut down.") 246 | except Exception as e: 247 | logger.error(f"Error during PTB application shutdown: {e}", exc_info=True) 248 | 249 | 250 | # --- FastAPI Application Definition --- 251 | app = FastAPI(lifespan=lifespan) 252 | 253 | 254 | # --- Webhook Endpoint --- 255 | @app.post(f"/{WEBHOOK_SECRET_PATH}") 256 | async def webhook( 257 | request: Request, 258 | secret_token: str | None = Header(None, alias="X-Telegram-Bot-Api-Secret-Token"), 259 | ) -> Response: 260 | """Handles incoming Telegram updates via webhook.""" 261 | logger.info("Webhook endpoint called") 262 | 263 | # --- Webhook Secret Token Verification --- 264 | TELEGRAM_WEBHOOK_SECRET_TOKEN = os.getenv("TELEGRAM_WEBHOOK_SECRET_TOKEN") 265 | 
if TELEGRAM_WEBHOOK_SECRET_TOKEN and secret_token != TELEGRAM_WEBHOOK_SECRET_TOKEN: 266 | logger.warning( 267 | f"Invalid secret token received: '{secret_token}' vs expected token" 268 | ) 269 | raise HTTPException(status_code=403, detail="Invalid secret token") 270 | 271 | # Ensure the bot is initialized before processing updates 272 | if not hasattr(app.state, "bot_initialized") or not app.state.bot_initialized: 273 | logger.error("Bot not yet initialized. Request rejected.") 274 | raise HTTPException(status_code=503, detail="Bot initialization in progress") 275 | 276 | try: 277 | # Get the raw request body for logging if needed 278 | body = await request.body() 279 | logger.info(f"Received webhook request body length: {len(body)} bytes") 280 | 281 | # Parse the request JSON 282 | update_data = await request.json() 283 | logger.info(f"Successfully parsed update JSON") 284 | 285 | # Convert to Telegram Update object 286 | update = Update.de_json(update_data, ptb_app.bot) 287 | logger.info( 288 | f"Received update: {update.update_id}, type: {type(update).__name__}" 289 | ) 290 | 291 | # Extract some basic info for logging 292 | message = update.message or update.edited_message 293 | if message: 294 | logger.info( 295 | f"Message content: '{message.text if message.text else '[no text]'}'" 296 | ) 297 | 298 | # Process the update 299 | # logger.info("Processing update with PTB application...") 300 | # await ptb_app.process_update(update) 301 | # logger.info(f"Successfully processed update {update.update_id}") 302 | 303 | # Kick off processing in the background and ACK Telegram immediately 304 | logger.info("Scheduling background processing...") 305 | asyncio.create_task(ptb_app.process_update(update)) 306 | return {"ok": True} # must be <10 s 307 | 308 | except json.JSONDecodeError as e: 309 | logger.error(f"Failed to parse webhook request JSON: {e}", exc_info=True) 310 | return {"ok": False, "error": "Invalid JSON"} 311 | except Exception as e: 312 | 
logger.error(f"Error processing update: {e}", exc_info=True) 313 | # Return 200 even on error to prevent Telegram from retrying too aggressively 314 | return {"ok": False, "error": str(e)} 315 | 316 | 317 | # --- Health Check Endpoint (Good Practice) --- 318 | @app.get("/health") 319 | async def health_check(): 320 | """Basic health check endpoint.""" 321 | logger.info("Health check endpoint called.") 322 | return {"status": "ok"} 323 | 324 | 325 | # --- Main Execution Block (for running with uvicorn) --- 326 | if __name__ == "__main__": 327 | host = os.getenv("HOST", "0.0.0.0") 328 | port = int(os.getenv("PORT", "8080")) 329 | 330 | # Check if we should run in polling mode (for local testing without webhook) 331 | use_polling = os.getenv("USE_POLLING", "false").lower() == "true" 332 | 333 | if use_polling: 334 | logger.info("Starting bot in polling mode (from __main__)...") 335 | # This block is mainly for running `python bot.py` directly. 336 | # When running with Uvicorn, the lifespan handler above manages polling. 337 | 338 | # Set a flag that lifespan can check if needed, though direct call is better. 339 | os.environ["_SUPERVISOR_USE_POLLING_MODE"] = "1" 340 | 341 | async def main_polling_directly(): 342 | global ptb_app 343 | logger.info( 344 | "Initializing PTB application for direct polling (main_polling_directly)..." 
345 | ) 346 | await ptb_app.initialize() 347 | url_handler = MessageHandler( 348 | filters.TEXT & (~filters.COMMAND), handle_message 349 | ) 350 | ptb_app.add_handler(url_handler) 351 | await ptb_app.start() 352 | logger.info("Starting PTB polling loop (main_polling_directly)...") 353 | try: 354 | await ptb_app.updater.start_polling(poll_interval=1.0) 355 | while True: # Keep alive 356 | await asyncio.sleep(3600) 357 | except KeyboardInterrupt: 358 | logger.info("Polling stopped by user (main_polling_directly).") 359 | finally: 360 | logger.info("Shutting down PTB from main_polling_directly...") 361 | if ptb_app.updater.running: 362 | ptb_app.updater.stop() # stop() is not awaitable here 363 | if ptb_app.running: 364 | await ptb_app.stop() 365 | await ptb_app.shutdown() 366 | logger.info("PTB application shut down after direct polling.") 367 | 368 | asyncio.run(main_polling_directly()) 369 | if "_SUPERVISOR_USE_POLLING_MODE" in os.environ: 370 | del os.environ["_SUPERVISOR_USE_POLLING_MODE"] 371 | else: 372 | # Run in webhook mode with FastAPI/Uvicorn 373 | logger.info(f"Starting Uvicorn server on {host}:{port} for webhook mode...") 374 | uvicorn.run(app, host=host, port=port) 375 | -------------------------------------------------------------------------------- /images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/images/image.png -------------------------------------------------------------------------------- /images/system_arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/images/system_arch.jpg -------------------------------------------------------------------------------- /langgraph.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "dependencies": ["."], 3 | "graphs": { 4 | "agent": "./agent.py:graph" 5 | }, 6 | "env": ".env" 7 | } 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "telegram-link-summarizer-agent" 3 | version = "0.1.0" 4 | description = "An agentic Telegram bot to summarize links and papers." 5 | readme = "README.md" 6 | requires-python = ">=3.11" # Adjusted to match Docker image 7 | dependencies = [ 8 | "baml-py>=0.88.0", 9 | "langgraph>=0.0.57", 10 | "langchain>=0.2.0", 11 | "langchain-openai>=0.1.7", # Assuming OpenAI, change if needed 12 | "python-telegram-bot[ext]>=21.0", 13 | "requests>=2.31.0", 14 | "pypdf>=4.2.0", 15 | "python-dotenv>=1.0.1", 16 | "tavily-python>=0.3.3", # Adding Tavily as it's in config 17 | # Add any other specific langchain community/experimental packages if used 18 | "langgraph-checkpoint-sqlite>=2.0.6", 19 | "langgraph-cli[inmem]>=0.2.7", 20 | "marimo>=0.13.2", 21 | "langchain-community>=0.3.22", 22 | "rich>=14.0.0", 23 | "loguru>=0.7.3", 24 | "fastapi>=0.115.12", 25 | "uvicorn>=0.34.2", 26 | "pymupdf>=1.25.5", 27 | "click>=8.1.8", 28 | "h11>=0.16.0", 29 | "starlette>=0.46.2", 30 | "pydantic>=2.11.3", 31 | "pydantic-core>=2.33.1", 32 | "typing-extensions>=4.13.2", 33 | "typing-inspection>=0.4.0", 34 | "annotated-types>=0.7.0", 35 | "anyio>=4.9.0", 36 | "nest-asyncio>=1.6.0", 37 | "yt-dlp>=2025.4.30", 38 | "youtube-transcript-api>=1.0.3", 39 | "google-api-python-client>=2.169.0", 40 | "google-cloud-secret-manager>=2.20.0", # Added for GCP Secret Manager access 41 | "bs4>=0.0.2", 42 | "playwright>=1.52.0", 43 | "agentql>=1.10.0", 44 | "beautifulsoup4>=4.13.4", 45 | ] 46 | 47 | [tool.setuptools] 48 | py-modules = ["agent", "bot", "config"] 49 | packages = ["baml_client", "tools"] 50 | 
-------------------------------------------------------------------------------- /scripts/deploy_cloud_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Deploy the Telegram Summarizer Bot to Google Cloud Run --- 4 | 5 | set -e # Exit immediately if a command exits with a non-zero status. 6 | 7 | # --- Configuration --- 8 | # You can set these environment variables or the script will prompt you. 9 | # PROJECT_ID="your-gcp-project-id" 10 | # REGION="your-preferred-region" # e.g., us-central1 11 | # SERVICE_NAME="telegram-summarizer" 12 | # REPO_NAME="my-summarizer-bot-repo" # Artifact Registry repo name 13 | 14 | # Define the secrets to map from Secret Manager to Cloud Run environment variables. 15 | # Format: "ENV_VAR_NAME_IN_CLOUDRUN=SECRET_NAME_IN_MANAGER:latest" 16 | SECRETS_TO_MAP=( 17 | "GEMINI_API_KEY=GEMINI_API_KEY:latest" 18 | "DEEPSEEK_API_KEY=DEEPSEEK_API_KEY:latest" 19 | "TAVILY_API_KEY=TAVILY_API_KEY:latest" 20 | "TWITTER_API_IO_KEY=TWITTER_API_IO_KEY:latest" 21 | "AGENTQL_API_KEY=AGENTQL_API_KEY:latest" 22 | "TELEGRAM_BOT_TOKEN=TELEGRAM_BOT_TOKEN:latest" 23 | "TELEGRAM_WEBHOOK_SECRET_TOKEN=TELEGRAM_WEBHOOK_SECRET_TOKEN:latest" 24 | "WEBHOOK_SECRET_PATH=WEBHOOK_SECRET_PATH:latest" 25 | ) 26 | 27 | # --- Script Logic --- 28 | 29 | # Check dependencies 30 | if ! command -v gcloud &> /dev/null; then echo "Error: gcloud not found. Please install Google Cloud SDK." >&2; exit 1; fi 31 | if ! command -v docker &> /dev/null; then echo "Error: docker not found. Please install Docker." >&2; exit 1; fi 32 | 33 | # Get configuration if not set via environment variables 34 | PROJECT_ID=${PROJECT_ID:-"$(gcloud config get-value project)"} 35 | if [ -z "${PROJECT_ID}" ]; then read -p "Enter Google Cloud Project ID: " PROJECT_ID; fi 36 | if [ -z "${PROJECT_ID}" ]; then echo "Error: Project ID is required." 
>&2; exit 1; fi 37 | gcloud config set project "$PROJECT_ID" 38 | 39 | REGION=${REGION:-"$(gcloud config get-value run/region)"} 40 | if [ -z "${REGION}" ]; then read -p "Enter Google Cloud Region (e.g., us-central1): " REGION; fi 41 | if [ -z "${REGION}" ]; then echo "Error: Region is required." >&2; exit 1; fi 42 | gcloud config set run/region "$REGION" 43 | 44 | SERVICE_NAME=${SERVICE_NAME:-"telegram-summarizer"} 45 | read -p "Enter Cloud Run Service Name [${SERVICE_NAME}]: " INPUT_SERVICE_NAME 46 | SERVICE_NAME=${INPUT_SERVICE_NAME:-$SERVICE_NAME} 47 | 48 | REPO_NAME=${REPO_NAME:-"summarizer-bot-repo"} 49 | read -p "Enter Artifact Registry Repository Name [${REPO_NAME}]: " INPUT_REPO_NAME 50 | REPO_NAME=${INPUT_REPO_NAME:-$REPO_NAME} 51 | 52 | # Construct image name 53 | IMAGE_NAME="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO_NAME}/${SERVICE_NAME}:latest" 54 | 55 | echo "--- Deployment Configuration ---" 56 | echo "Project ID: $PROJECT_ID" 57 | echo "Region: $REGION" 58 | echo "Service Name: $SERVICE_NAME" 59 | echo "Artifact Repo: $REPO_NAME" 60 | echo "Image Name: $IMAGE_NAME" 61 | echo "------------------------------" 62 | read -p "Proceed with deployment? (y/N): " CONFIRM 63 | if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then 64 | echo "Deployment cancelled." 65 | exit 0 66 | fi 67 | 68 | # Get the directory of the script itself 69 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 70 | PROJECT_ROOT="$SCRIPT_DIR/.." 71 | 72 | # Enable APIs 73 | echo "Enabling required Google Cloud APIs..." 74 | gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com secretmanager.googleapis.com --project="$PROJECT_ID" 75 | 76 | # Create Artifact Registry Repository 77 | echo "Checking/Creating Artifact Registry repository '$REPO_NAME' in region '$REGION'..." 78 | if ! 
gcloud artifacts repositories describe "$REPO_NAME" --location="$REGION" --project="$PROJECT_ID" &> /dev/null; then 79 | gcloud artifacts repositories create "$REPO_NAME" \ 80 | --repository-format=docker \ 81 | --location="$REGION" \ 82 | --description="Docker repository for $SERVICE_NAME" \ 83 | --project="$PROJECT_ID" 84 | echo "Created Artifact Registry repository." 85 | else 86 | echo "Artifact Registry repository already exists." 87 | fi 88 | 89 | # Configure Docker Authentication 90 | echo "Configuring Docker authentication for $REGION..." 91 | gcloud auth configure-docker "${REGION}-docker.pkg.dev" --project="$PROJECT_ID" 92 | 93 | # Build the Docker image 94 | echo "Building Docker image '$IMAGE_NAME' from $PROJECT_ROOT..." 95 | cd "$PROJECT_ROOT" || exit 1 96 | docker build -t "$IMAGE_NAME" . 97 | if [ $? -ne 0 ]; then echo "Error: Docker build failed." >&2; exit 1; fi 98 | 99 | # Push the Docker image 100 | echo "Pushing Docker image to Artifact Registry..." 101 | docker push "$IMAGE_NAME" 102 | if [ $? -ne 0 ]; then echo "Error: Docker push failed." >&2; exit 1; fi 103 | 104 | # Construct secrets argument 105 | if [ ${#SECRETS_TO_MAP[@]} -eq 0 ]; then 106 | # This case should not happen with the hardcoded list above 107 | echo "Internal Error: SECRETS_TO_MAP array is empty in script ($0)." >&2 108 | exit 1 109 | fi 110 | SECRETS_ARG=$(printf -- "--set-secrets=%s" "$(IFS=,; echo "${SECRETS_TO_MAP[*]}")") 111 | echo "Will map the following secrets to environment variables in Cloud Run:" 112 | printf " %s\n" "${SECRETS_TO_MAP[@]}" 113 | 114 | # Deploy to Cloud Run 115 | echo "Deploying service '$SERVICE_NAME' to Cloud Run in region '$REGION'..." 116 | 117 | # You can add additional flags here as needed. 
Example: 118 | # --memory=512Mi # Specify memory 119 | # --cpu=1 # Specify CPU 120 | # --min-instances=0 # Allows scaling to zero (default, so not explicitly required) 121 | # --max-instances=10 # Maximum number of instances 122 | 123 | gcloud run deploy "$SERVICE_NAME" \ 124 | --image="$IMAGE_NAME" \ 125 | --platform=managed \ 126 | --region="$REGION" \ 127 | --port=8080 \ 128 | --allow-unauthenticated \ 129 | --memory=1024Mi \ 130 | --min-instances=1 \ 131 | --cpu-throttling \ 132 | $SECRETS_ARG \ 133 | --project="$PROJECT_ID" 134 | 135 | if [ $? -ne 0 ]; then echo "Error: Cloud Run deployment failed." >&2; exit 1; fi 136 | 137 | # Get the service URL 138 | SERVICE_URL=$(gcloud run services describe "$SERVICE_NAME" --platform managed --region "$REGION" --format 'value(status.url)' --project="$PROJECT_ID") 139 | echo "Service deployed successfully. URL: $SERVICE_URL" 140 | 141 | # --- Set Telegram Webhook --- 142 | echo "Attempting to set Telegram webhook..." 143 | 144 | # Find the secret IDs for the bot token and webhook path from the mapping 145 | TELEGRAM_BOT_TOKEN_SECRET_ID="" 146 | WEBHOOK_SECRET_PATH_SECRET_ID="" 147 | WEBHOOK_SECRET_TOKEN_SECRET_ID="" 148 | 149 | for mapping in "${SECRETS_TO_MAP[@]}"; do 150 | env_var_name=$(echo "$mapping" | cut -d'=' -f1) 151 | secret_ref=$(echo "$mapping" | cut -d'=' -f2) 152 | secret_id=$(echo "$secret_ref" | cut -d':' -f1) 153 | 154 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "telegram_bot_token" ]]; then 155 | TELEGRAM_BOT_TOKEN_SECRET_ID="$secret_id" 156 | fi 157 | # Use the env var name expected by bot.py (which matches the secret name here) 158 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "webhook_secret_path" ]]; then 159 | WEBHOOK_SECRET_PATH_SECRET_ID="$secret_id" 160 | fi 161 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "telegram_webhook_secret_token" ]]; then 162 | WEBHOOK_SECRET_TOKEN_SECRET_ID="$secret_id" 163 | fi 164 | done 165 | 166 | if [ -z 
"$TELEGRAM_BOT_TOKEN_SECRET_ID" ]; then 167 | echo "Error: Could not find TELEGRAM_BOT_TOKEN mapping in SECRETS_TO_MAP. Cannot set webhook." >&2 168 | exit 1 169 | fi 170 | 171 | if [ -z "$WEBHOOK_SECRET_PATH_SECRET_ID" ]; then 172 | echo "Warning: Could not find WEBHOOK_SECRET_PATH mapping in SECRETS_TO_MAP." >&2 173 | echo "Will attempt to set webhook using default '/webhook' path." >&2 174 | # Default path if not found in secrets 175 | WEBHOOK_PATH_VALUE="/webhook" 176 | else 177 | echo "Fetching Webhook Secret Path from Secret Manager..." 178 | WEBHOOK_PATH_VALUE=$(gcloud secrets versions access latest --secret="$WEBHOOK_SECRET_PATH_SECRET_ID" --project="$PROJECT_ID") 179 | # Ensure path starts with a slash 180 | if [[ "$WEBHOOK_PATH_VALUE" != /* ]]; then 181 | WEBHOOK_PATH_VALUE="/$WEBHOOK_PATH_VALUE" 182 | fi 183 | fi 184 | 185 | # Fetch the latest version of the secrets 186 | echo "Fetching Telegram Bot Token from Secret Manager..." 187 | TELEGRAM_BOT_TOKEN=$(gcloud secrets versions access latest --secret="$TELEGRAM_BOT_TOKEN_SECRET_ID" --project="$PROJECT_ID") 188 | 189 | WEBHOOK_SECRET_TOKEN="" 190 | if [ -n "$WEBHOOK_SECRET_TOKEN_SECRET_ID" ]; then 191 | echo "Fetching Webhook Secret Token from Secret Manager..." 192 | WEBHOOK_SECRET_TOKEN=$(gcloud secrets versions access latest --secret="$WEBHOOK_SECRET_TOKEN_SECRET_ID" --project="$PROJECT_ID") 193 | fi 194 | 195 | # Construct webhook URL 196 | FINAL_WEBHOOK_URL="${SERVICE_URL}${WEBHOOK_PATH_VALUE}" 197 | 198 | echo "Setting webhook to: $FINAL_WEBHOOK_URL" 199 | 200 | # Use curl to set the webhook 201 | API_URL="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" 202 | 203 | echo "DEBUG: Preparing curl command..." 204 | 205 | # Construct the complete JSON payload in one string 206 | if [ -n "$WEBHOOK_SECRET_TOKEN" ]; then 207 | echo "Using webhook secret token." 
208 | JSON_PAYLOAD="{\"url\": \"$FINAL_WEBHOOK_URL\", \"secret_token\": \"$WEBHOOK_SECRET_TOKEN\"}" 209 | else 210 | echo "No webhook secret token found/used." 211 | JSON_PAYLOAD="{\"url\": \"$FINAL_WEBHOOK_URL\"}" 212 | fi 213 | 214 | echo "DEBUG: JSON Payload: $JSON_PAYLOAD" 215 | 216 | # Function to make the curl request with retries 217 | set_webhook_with_retry() { 218 | local max_retries=3 219 | local retry_count=0 220 | local wait_time=2 221 | 222 | while [ $retry_count -lt $max_retries ]; do 223 | echo "DEBUG: Setting webhook (attempt $((retry_count + 1))/$max_retries)..." 224 | 225 | # Make the request 226 | RESPONSE=$(curl -s -X POST "$API_URL" \ 227 | -H "Content-Type: application/json" \ 228 | -d "$JSON_PAYLOAD") 229 | 230 | CURL_EXIT_CODE=$? 231 | 232 | # Check for rate limit error 233 | if [ $CURL_EXIT_CODE -eq 0 ] && echo "$RESPONSE" | grep -q '"error_code":429'; then 234 | retry_after=$(echo "$RESPONSE" | grep -o '"retry_after":[0-9]*' | grep -o '[0-9]*') 235 | 236 | # If retry_after is not found or not a number, use default wait time 237 | if [ -z "$retry_after" ] || ! [[ "$retry_after" =~ ^[0-9]+$ ]]; then 238 | retry_after=$wait_time 239 | fi 240 | 241 | echo "Rate limited by Telegram API. Waiting ${retry_after}s before retry..." 242 | sleep $((retry_after + 1)) # Wait a bit longer than recommended 243 | retry_count=$((retry_count + 1)) 244 | continue 245 | fi 246 | 247 | # If we get here, either there was no rate limit error or another error occurred 248 | break 249 | done 250 | 251 | return $CURL_EXIT_CODE 252 | } 253 | 254 | # Call the function to make the request with retries 255 | set_webhook_with_retry 256 | CURL_EXIT_CODE=$? 
257 | 258 | # For debugging, let's see the response 259 | echo "DEBUG: Webhook response: $RESPONSE" 260 | 261 | # Check response from Telegram API 262 | if [ $CURL_EXIT_CODE -ne 0 ]; then 263 | echo "Error: curl command failed with exit code $CURL_EXIT_CODE" >&2 264 | exit 1 265 | elif echo "$RESPONSE" | grep -q '"ok":true'; then 266 | echo "Telegram webhook set successfully!" 267 | echo "Result: $RESPONSE" 268 | elif echo "$RESPONSE" | grep -q '"description":"Webhook is already set"'; then 269 | # This is also a success case, webhook is properly set 270 | echo "Telegram webhook was already set to this URL." 271 | echo "Result: $RESPONSE" 272 | else 273 | echo "Error setting Telegram webhook." >&2 274 | echo "URL used: $FINAL_WEBHOOK_URL" >&2 275 | echo "Check TELEGRAM_BOT_TOKEN, WEBHOOK_SECRET_PATH (if used), and ensure the service URL is correct and publicly accessible." >&2 276 | echo "Telegram API Response: $RESPONSE" >&2 277 | exit 1 278 | fi 279 | 280 | echo "--- Deployment Complete ---" 281 | -------------------------------------------------------------------------------- /scripts/deploy_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Deploy Telegram Summarizer Bot to Self-Managed Server --- 4 | 5 | set -e # Exit immediately if a command exits with a non-zero status. 
# --- Configuration ---
# All values are overridable from the environment; defaults are for the
# project's own server.
# NOTE(review): SERVER_IP default is a hard-coded public IP — confirm it is
# still the intended target before reuse.
SERVER_IP=${SERVER_IP:-"38.54.75.29"}
CONTAINER_NAME=${CONTAINER_NAME:-"telegram-summarizer"}
IMAGE_NAME=${IMAGE_NAME:-"telegram-summarizer:latest"}
HOST_PORT=${HOST_PORT:-"8080"}
CONTAINER_PORT=${CONTAINER_PORT:-"8080"}

echo "--- Server Deployment Configuration ---"
echo "Server IP: $SERVER_IP"
echo "Container Name: $CONTAINER_NAME"
echo "Image Name: $IMAGE_NAME"
echo "Host Port: $HOST_PORT"
echo "Container Port: $CONTAINER_PORT"
echo "----------------------------------------"

# Get the directory of the script itself
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
PROJECT_ROOT="$SCRIPT_DIR/.."

# Check if .env file exists (required for --env-file below)
if [ ! -f "$PROJECT_ROOT/.env" ]; then
    echo "Error: .env file not found in project root. Please create it with your environment variables." >&2
    exit 1
fi

echo "Found .env file. Proceeding with deployment..."

# Stop and remove existing container if it exists.
# `|| true` keeps these from tripping the script's `set -e` when the
# container does not exist yet.
echo "Stopping and removing existing container (if any)..."
docker stop "$CONTAINER_NAME" 2>/dev/null || true
docker rm "$CONTAINER_NAME" 2>/dev/null || true

# Remove old image to ensure we build fresh
echo "Removing old image (if any)..."
docker rmi "$IMAGE_NAME" 2>/dev/null || true

# Build the Docker image
echo "Building Docker image '$IMAGE_NAME'..."
cd "$PROJECT_ROOT" || exit 1
docker build -t "$IMAGE_NAME" .
# NOTE(review): the script runs under `set -e` (declared at its top), so a
# failing build exits before this `$?` check is ever reached — the check is
# effectively dead code; confirm whether `set -e` or the explicit checks are
# the intended error strategy.
if [ $? -ne 0 ]; then
    echo "Error: Docker build failed." >&2
    exit 1
fi

# Run the container detached, auto-restarting, with the .env file injected
# as container environment variables.
echo "Starting container '$CONTAINER_NAME'..."
docker run -d \
    --name "$CONTAINER_NAME" \
    --restart unless-stopped \
    -p "$HOST_PORT:$CONTAINER_PORT" \
    --env-file .env \
    "$IMAGE_NAME"

# NOTE(review): dead under `set -e` — see build check above.
if [ $? -ne 0 ]; then
    echo "Error: Failed to start container." >&2
    exit 1
fi

echo "Container started successfully!"

# Wait a moment for the container to start
sleep 5

# Check container status; on failure dump the logs before bailing out.
echo "Checking container status..."
docker ps | grep "$CONTAINER_NAME" || {
    echo "Error: Container is not running. Checking logs..."
    docker logs "$CONTAINER_NAME"
    exit 1
}

# Check health endpoint (non-fatal: a failed probe only prints recent logs)
echo "Checking health endpoint..."
sleep 10 # Give the app time to start
if curl -f "http://localhost:$HOST_PORT/health" >/dev/null 2>&1; then
    echo "✅ Health check passed!"
else
    echo "⚠️ Health check failed. Checking logs..."
    docker logs --tail 20 "$CONTAINER_NAME"
fi

# Set Telegram webhook
echo "Setting Telegram webhook..."
if [ -f "$PROJECT_ROOT/.env" ]; then
    # Source the .env file to get variables into this shell.
    set -a # automatically export all variables
    source "$PROJECT_ROOT/.env"
    set +a # stop automatically exporting

    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$WEBHOOK_URL" ] && [ -n "$WEBHOOK_SECRET_PATH" ]; then
        FULL_WEBHOOK_URL="${WEBHOOK_URL}${WEBHOOK_SECRET_PATH}"
        echo "Setting webhook to: $FULL_WEBHOOK_URL"

        # Prepare JSON payload; secret_token is optional and lets the bot
        # verify that incoming updates really come from Telegram.
        if [ -n "$TELEGRAM_WEBHOOK_SECRET_TOKEN" ]; then
            JSON_PAYLOAD="{\"url\": \"$FULL_WEBHOOK_URL\", \"secret_token\": \"$TELEGRAM_WEBHOOK_SECRET_TOKEN\"}"
        else
            JSON_PAYLOAD="{\"url\": \"$FULL_WEBHOOK_URL\"}"
        fi

        # Set webhook via the Telegram Bot API
        RESPONSE=$(curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
            -H "Content-Type: application/json" \
            -d "$JSON_PAYLOAD")

        if echo "$RESPONSE" | grep -q '"ok":true'; then
            echo "✅ Telegram webhook set successfully!"
        else
            echo "⚠️ Failed to set Telegram webhook. Response: $RESPONSE"
        fi
    else
        echo "⚠️ Missing webhook configuration in .env file. Please set webhook manually."
120 | fi 121 | fi 122 | 123 | echo "" 124 | echo "--- Deployment Complete ---" 125 | echo "Container: $CONTAINER_NAME" 126 | echo "Status: $(docker inspect -f '{{.State.Status}}' $CONTAINER_NAME)" 127 | echo "Logs: docker logs $CONTAINER_NAME" 128 | echo "Stop: docker stop $CONTAINER_NAME" 129 | echo "Restart: docker restart $CONTAINER_NAME" 130 | echo "" 131 | echo "Your bot should now be accessible at: http://$SERVER_IP:$HOST_PORT" 132 | echo "Health check: http://$SERVER_IP:$HOST_PORT/health" -------------------------------------------------------------------------------- /scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Build and Run the Telegram Summarizer Bot using Docker --- 4 | 5 | IMAGE_NAME="telegram-summarizer" 6 | CONTAINER_NAME="summarizer-bot" 7 | 8 | # Get the directory of the script itself 9 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 10 | # Go one level up to the project root 11 | PROJECT_ROOT="$SCRIPT_DIR/.." 12 | 13 | ENV_FILE="$PROJECT_ROOT/.env" 14 | 15 | if [ ! -f "$ENV_FILE" ]; then 16 | echo "Error: .env file not found at $ENV_FILE" >&2 17 | echo "Please create and configure the .env file before running." >&2 18 | exit 1 19 | fi 20 | 21 | echo "Building the Docker image ($IMAGE_NAME)..." 22 | cd "$PROJECT_ROOT" || exit 1 23 | docker build -t "$IMAGE_NAME" . 24 | 25 | if [ $? -ne 0 ]; then 26 | echo "Error: Docker build failed." >&2 27 | exit 1 28 | fi 29 | 30 | echo "Stopping and removing existing container named '$CONTAINER_NAME' (if any)..." 31 | docker stop "$CONTAINER_NAME" > /dev/null 2>&1 32 | docker rm "$CONTAINER_NAME" > /dev/null 2>&1 33 | 34 | echo "Running the Docker container ($CONTAINER_NAME) with .env file..." 35 | echo "Access the health check at http://localhost:8080/health" 36 | 37 | docker run -p 8080:8080 --rm --name "$CONTAINER_NAME" --env-file "$ENV_FILE" "$IMAGE_NAME" 38 | 39 | if [ $? 
-ne 0 ]; then
    echo "Error: Failed to run Docker container." >&2
    exit 1
fi
-------------------------------------------------------------------------------- /scripts/run_local.sh: --------------------------------------------------------------------------------
#!/bin/bash

# --- Run the Telegram Summarizer Bot Locally (without Docker) ---

echo "Starting the Telegram Summarizer Bot using uvicorn..."
echo "Ensure you have installed dependencies using 'uv sync'"
echo "Ensure your .env file is configured in the project root."

# Get the directory of the script itself
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Go one level up to the project root
PROJECT_ROOT="$SCRIPT_DIR/.."

# Run uvicorn from the project root (--reload for local development)
cd "$PROJECT_ROOT" || exit 1
uvicorn bot:app --host 0.0.0.0 --port 8080 --reload
-------------------------------------------------------------------------------- /scripts/setup_secrets.sh: --------------------------------------------------------------------------------
#!/bin/bash

# --- Setup Secrets in Google Cloud Secret Manager ---
# Interactively creates (or versions) each application secret and grants the
# default Compute service account read access so Cloud Run can mount them.

# Define the specific secrets used by this application
SECRETS=(
    "GEMINI_API_KEY"
    "DEEPSEEK_API_KEY"
    "TAVILY_API_KEY"
    "TWITTER_API_IO_KEY"
    "AGENTQL_API_KEY"
    "TELEGRAM_BOT_TOKEN"
    "TELEGRAM_WEBHOOK_SECRET_TOKEN"
    "WEBHOOK_SECRET_PATH"
)

# --- You shouldn't need to edit below this line ---

# Check if gcloud is installed
if ! command -v gcloud &> /dev/null; then
    echo "Error: gcloud command not found. Please install the Google Cloud SDK." >&2
    exit 1
fi

# Get PROJECT_ID if not set (prompts interactively when absent from env)
if [ -z "${PROJECT_ID}" ]; then
    read -p "Enter your Google Cloud Project ID: " PROJECT_ID
    if [ -z "${PROJECT_ID}" ]; then
        echo "Error: Project ID cannot be empty." >&2
        exit 1
    fi
    export PROJECT_ID
    gcloud config set project "$PROJECT_ID"
fi

echo "Using Project ID: $PROJECT_ID"

echo "Enabling Secret Manager API (if not already enabled)..."
gcloud services enable secretmanager.googleapis.com --project="$PROJECT_ID"

# --- Grant Secret Accessor Role to Default Compute Service Account ---
echo "Fetching Project Number for $PROJECT_ID..."
PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" --format='value(projectNumber)')

if [ -z "$PROJECT_NUMBER" ]; then
    echo "Error: Could not fetch Project Number for Project ID $PROJECT_ID." >&2
    echo "Please ensure the Project ID is correct and you have permissions." >&2
    exit 1
fi

# Cloud Run uses the default Compute Engine service account unless overridden.
SERVICE_ACCOUNT_EMAIL="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
ROLE_TO_GRANT="roles/secretmanager.secretAccessor"

echo "Checking if service account $SERVICE_ACCOUNT_EMAIL has role $ROLE_TO_GRANT..."
# Check current policy binding (suppress errors if role isn't found)
if ! gcloud projects get-iam-policy "$PROJECT_ID" \
    --flatten="bindings[].members" \
    --format='table(bindings.role)' \
    --filter="bindings.members:$SERVICE_ACCOUNT_EMAIL AND bindings.role:$ROLE_TO_GRANT" 2>/dev/null | grep -q "$ROLE_TO_GRANT"; then

    echo "Granting '$ROLE_TO_GRANT' to service account '$SERVICE_ACCOUNT_EMAIL' on project '$PROJECT_ID'..."
    gcloud projects add-iam-policy-binding "$PROJECT_ID" \
        --member="serviceAccount:$SERVICE_ACCOUNT_EMAIL" \
        --role="$ROLE_TO_GRANT" \
        --condition=None # Explicitly setting no condition

    # NOTE(review): a grant failure only warns — the script deliberately
    # continues so secrets can still be created; confirm that is intended.
    if [ $? -ne 0 ]; then
        echo "Error: Failed to grant IAM role $ROLE_TO_GRANT to $SERVICE_ACCOUNT_EMAIL." >&2
        echo "Please check permissions or grant the role manually via the Google Cloud Console." >&2
        # Decide if you want to exit or continue
        # exit 1
    else
        echo "IAM role granted successfully."
    fi
else
    echo "Service account already has the required role."
fi
# --- End Grant Role ---


if [ ${#SECRETS[@]} -eq 0 ]; then
    echo "Internal Error: SECRETS array is empty in script ($0)." >&2
    exit 1
fi


# For each secret: add a new version when it already exists (after a
# confirmation prompt), otherwise create it and seed the first version.
for SECRET_NAME in "${SECRETS[@]}"; do
    echo "-------------------------------------"
    echo "Processing Secret: $SECRET_NAME"

    # Check if secret exists
    if gcloud secrets describe "$SECRET_NAME" --project="$PROJECT_ID" &> /dev/null; then
        echo "Secret '$SECRET_NAME' already exists."
        read -p "Do you want to add a new version with a new value? (y/N): " ADD_VERSION_CONFIRM
        if [[ "$ADD_VERSION_CONFIRM" =~ ^[Yy]$ ]]; then
            # Add new version
            # Prompt for the secret value without echoing to the terminal
            echo -n "Enter the new value for secret '$SECRET_NAME': "
            read -s SECRET_VALUE
            echo # Add a newline after reading the secret
            if [ -z "$SECRET_VALUE" ]; then
                echo "Warning: Secret value is empty. Skipping adding new version for '$SECRET_NAME'." >&2
            else
                # printf (not echo) so the value is written without a trailing newline
                printf "%s" "$SECRET_VALUE" | gcloud secrets versions add "$SECRET_NAME" --data-file=- --project="$PROJECT_ID"
                echo "Added new version to secret '$SECRET_NAME'."
            fi
        else
            echo "Skipping secret '$SECRET_NAME'."
        fi
    else
        # Create secret
        echo "Secret '$SECRET_NAME' does not exist. Creating it..."
        gcloud secrets create "$SECRET_NAME" --replication-policy="automatic" --project="$PROJECT_ID"
        if [ $? -ne 0 ]; then
            echo "Error: Failed to create secret '$SECRET_NAME'." >&2
            continue # Skip to the next secret
        fi
        echo "Created secret '$SECRET_NAME'."

        # Add the first version
        # Prompt for the secret value without echoing to the terminal
        echo -n "Enter the value for secret '$SECRET_NAME': "
        read -s SECRET_VALUE
        echo
        if [ -z "$SECRET_VALUE" ]; then
            echo "Warning: Secret value is empty. Creating secret '$SECRET_NAME' with no initial version." >&2
        else
            printf "%s" "$SECRET_VALUE" | gcloud secrets versions add "$SECRET_NAME" --data-file=- --project="$PROJECT_ID"
            echo "Added initial version to secret '$SECRET_NAME'."
        fi
    fi
done

echo "-------------------------------------"
echo "Secret setup process complete."
echo "Remember to grant your Cloud Run service account (PROJECT_NUMBER-compute@developer.gserviceaccount.com) the 'Secret Manager Secret Accessor' role for these secrets."
-------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/tools/__init__.py
-------------------------------------------------------------------------------- /tools/linkedin_agentql_scraper.py: --------------------------------------------------------------------------------
"""
linkedin_agentql_scraper.py

Scrape a public LinkedIn post's author name and full body text using Playwright + AgentQL.

⚠️ Limitations
* Works only for *public* posts (i.e. visible to signed-out visitors).
* For private / connection-only posts, you must authenticate first.

Prerequisites:
    pip install playwright agentql
    playwright install
    export AGENTQL_API_KEY=

Usage:
    python linkedin_agentql_scraper.py --url "https://www.linkedin.com/posts/..." [--headless]

The script prints a JSON-style dict with keys ``author`` and ``content``.
"""

from __future__ import annotations

import argparse
import os
import textwrap
from dotenv import load_dotenv
import agentql
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

load_dotenv()


def block_resources(route):
    """Playwright route handler: abort requests for heavy static assets.

    Skipping images/stylesheets/fonts speeds up page loads; everything else
    is allowed through unchanged.
    """
    if route.request.resource_type in ["image", "stylesheet", "font"]:
        route.abort()
    else:
        route.continue_()


def scrape_linkedin_post(url: str, headless: bool = True) -> dict[str, str]:
    """Return the post's author name and full text content.

    Args:
        url: Public LinkedIn post URL.
        headless: Launch Chromium without a visible window when True.

    Returns:
        Dict with keys ``author`` and ``content`` (either may be "" when
        AgentQL extraction finds nothing).

    Raises:
        Re-raises any navigation error from ``page.goto`` after closing the
        browser (a debug screenshot is attempted on timeout).
    """

    # 0. Configure AgentQL
    agentql.configure(api_key=os.getenv("AGENTQL_API_KEY", ""))

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=headless,
            args=["--no-sandbox"],  # needed when running inside containers
        )
        page = agentql.wrap(browser.new_page())

        # 1. Navigate & wait for DOM ready
        try:
            # Increased timeout to 60 seconds
            page.route("**/*", block_resources)
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
        except PlaywrightTimeoutError as e:
            print(f"Timeout during page.goto: {e}")
            try:
                # Attempt to save a screenshot for debugging
                screenshot_path = "linkedin_timeout_screenshot.png"
                page.screenshot(path=screenshot_path)
                print(f"Screenshot saved to {screenshot_path}")
            except Exception as se:
                print(f"Failed to save screenshot: {se}")
            browser.close()  # Ensure browser is closed on error
            raise  # Re-raise the original timeout error
        except Exception as e:
            print(f"An unexpected error occurred during page.goto: {e}")
            browser.close()  # Ensure browser is closed on error
            raise  # Re-raise the original error

        page.wait_for_page_ready_state()

        # 2. Accept cookies / privacy banner if shown (EU visitors)
        # AgentQL first; plain Playwright locator as fallback; missing banner
        # is not an error.
        try:
            banner = page.query_elements(
                """
                {
                    accept_cookies_btn
                }
                """
            )
            banner.accept_cookies_btn.click(timeout=3000)
        except Exception:
            # Fallback Locator
            try:
                page.locator("button:has-text('Accept cookies')").click(timeout=3000)
            except PlaywrightTimeoutError:
                pass  # No banner

        # 3. Expand "…see more" inside the post body (if truncated)
        try:
            more = page.query_elements(
                """
                {
                    expand_post_body_btn
                }
                """
            )
            more.expand_post_body_btn.click(timeout=3000)
        except Exception:
            try:
                page.locator("button:has-text('see more')").first.click(timeout=3000)
            except PlaywrightTimeoutError:
                pass

        # 4. Extract author & content via AgentQL
        data = page.query_data(
            """
            {
                author_name
                post_body_text
            }
            """
        )

        browser.close()

        return {
            "author": data.get("author_name", ""),
            "content": textwrap.dedent(data.get("post_body_text", "")).strip(),
        }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape a LinkedIn post via Playwright + AgentQL. Works for public posts only."
130 | ) 131 | parser.add_argument("--url", required=True, help="Public LinkedIn post URL") 132 | parser.add_argument( 133 | "--headless", 134 | action="store_true", 135 | help="Run browser in headless mode (default: GUI).", 136 | ) 137 | args = parser.parse_args() 138 | 139 | result = scrape_linkedin_post(args.url, headless=args.headless) 140 | print("\n=== RESULT ===") 141 | print("Author:", result["author"]) 142 | print("\nPost text:\n", result["content"]) 143 | -------------------------------------------------------------------------------- /tools/pdf_handler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import fitz # PyMuPDF 3 | 4 | def get_pdf_text(url: str) -> str: 5 | """Downloads a PDF from a URL and extracts its text content. 6 | 7 | Args: 8 | url: The URL of the PDF file. 9 | 10 | Returns: 11 | The extracted text content of the PDF. 12 | Returns an error message string if download or processing fails. 13 | """ 14 | try: 15 | response = requests.get(url, stream=True, timeout=30) # Add timeout 16 | response.raise_for_status() # Raise an exception for bad status codes 17 | 18 | # Check content type to ensure it's a PDF before downloading fully 19 | content_type = response.headers.get('Content-Type', '').lower() 20 | if 'application/pdf' not in content_type: 21 | return f"Error: URL does not point to a PDF file (Content-Type: {content_type})" 22 | 23 | # Read the content into memory 24 | pdf_content = response.content 25 | pdf_document = fitz.open(stream=pdf_content, filetype="pdf") 26 | 27 | text = "" 28 | for page_num in range(len(pdf_document)): 29 | page = pdf_document.load_page(page_num) 30 | text += page.get_text() 31 | 32 | pdf_document.close() 33 | return text 34 | 35 | except requests.exceptions.RequestException as e: 36 | return f"Error downloading PDF: {e}" 37 | except fitz.errors.FitzError as e: # Catch PyMuPDF specific errors 38 | return f"Error processing PDF: {e}" 39 | except 
Exception as e: 40 | return f"An unexpected error occurred: {e}" 41 | 42 | # Example usage (optional, for testing): 43 | if __name__ == '__main__': 44 | # Replace with a valid PDF URL for testing 45 | test_url = "https://arxiv.org/pdf/1706.03762.pdf" # Example: Attention is All You Need paper 46 | extracted_text = get_pdf_text(test_url) 47 | if extracted_text.startswith("Error:"): 48 | print(extracted_text) 49 | else: 50 | print("Successfully extracted text:") 51 | # Print first 500 characters as a sample 52 | print(extracted_text[:500] + "...") 53 | -------------------------------------------------------------------------------- /tools/search.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rich.console import Console 3 | from tavily import TavilyClient 4 | from dotenv import load_dotenv 5 | 6 | # Load environment variables from .env file 7 | load_dotenv(override=True) 8 | 9 | console = Console() 10 | 11 | # --- Tavily Client Initialization --- 12 | TAVILY_API_KEY: str | None = os.getenv("TAVILY_API_KEY") 13 | tavily_client = None 14 | if TAVILY_API_KEY: 15 | try: 16 | # Initialize TavilyClient 17 | tavily_client = TavilyClient(api_key=TAVILY_API_KEY) 18 | console.print("Tavily client initialized in tools/search.py.", style="bold green") 19 | except Exception as e: 20 | console.print(f"Failed to initialize Tavily client: {e}", style="bold red", exc_info=True) 21 | else: 22 | console.print("TAVILY_API_KEY not found in config. Tavily tool disabled.", style="bold yellow") 23 | 24 | def run_tavily_tool(mode: str, query: str = None, urls: list[str] = None, **kwargs) -> dict | str: 25 | """ 26 | Uses Tavily client to perform search or extract content from URLs. 27 | 28 | Args: 29 | mode: The operation mode ('search' or 'extract'). 30 | query: The search query (required for 'search' mode). 31 | urls: A list of URLs to extract content from (required for 'extract' mode). 
32 | **kwargs: Additional parameters for the Tavily API (e.g., max_results, topic, time_range for search). 33 | 34 | Returns: 35 | A dictionary containing the results (search or extract), or an error string. 36 | """ 37 | if not tavily_client: 38 | console.print("Tavily client not initialized. Cannot perform operation.", style="bold red") 39 | return "Error: Tavily client is not available. Check API Key." 40 | 41 | try: 42 | if mode == 'search': 43 | if not query: 44 | return "Error: Query is required for search mode." 45 | results = tavily_client.search(query=query, **kwargs) 46 | console.print(f"Tavily search successful for query: '{query}'", style="green") 47 | 48 | elif mode == 'extract': 49 | if not urls: 50 | return "Error: URLs are required for extract mode." 51 | results = tavily_client.extract(urls=urls, **kwargs) 52 | console.print(f"Tavily extract successful for URLs: {urls}", style="green") 53 | 54 | else: 55 | return f"Error: Invalid mode '{mode}'. Use 'search' or 'extract'." 56 | 57 | if not results: 58 | console.print(f"Tavily {mode} returned no results.", style="bold yellow") 59 | return f"Error: Tavily {mode} found no information." 60 | 61 | # The SDK returns a dictionary directly 62 | return results 63 | 64 | except Exception as e: 65 | console.print(f"Tavily {mode} failed: {e}", style="bold red", exc_info=True) 66 | return f"Error: Tavily {mode} encountered an error. 
{e}" 67 | 68 | # Example for testing (optional) 69 | if __name__ == "__main__": 70 | # --- Test Search --- 71 | # test_query = "Find the recent blog post from Dario Amodei about AI Interpretability" 72 | # console.print(f"\n--- Testing Tavily Search for: '{test_query}' ---", style="bold blue") 73 | # search_results = run_tavily_tool(mode='search', query=test_query, topic="news", max_results=3) 74 | # console.print("Search results:", style="bold green") 75 | # console.print(search_results) 76 | # console.print("--- End Search Test ---", style="bold blue") 77 | 78 | # --- Test Extract --- 79 | # Example URLs (replace with valid ones if needed) 80 | test_urls_for_extract = [ 81 | "https://www.darioamodei.com/post/the-urgency-of-interpretability" 82 | ] 83 | console.print(f"\n--- Testing Tavily Extract for URLs: {test_urls_for_extract} ---", style="bold blue") 84 | extract_results = run_tavily_tool(mode='extract', urls=test_urls_for_extract) 85 | console.print("Extract results:", style="bold green") 86 | console.print(extract_results) 87 | console.print("--- End Extract Test ---", style="bold blue") 88 | -------------------------------------------------------------------------------- /tools/twitter_api_tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | from datetime import datetime, timezone 5 | from dotenv import load_dotenv 6 | from rich.console import Console 7 | 8 | load_dotenv() 9 | console = Console() 10 | 11 | 12 | def _parse_twitter_datetime(datetime_str: str) -> datetime: 13 | """Parses Twitter's datetime string into a timezone-aware datetime object.""" 14 | # Format: 'Thu May 01 12:03:30 +0000 2025' 15 | # Need to handle the +0000 timezone correctly 16 | try: 17 | # Standard format doesn't handle '+0000' directly as %z prior to Python 3.7/3.11 depending on platform? 18 | # Let's parse manually or use a robust library if needed. 
19 | # For simplicity, assuming UTC (+0000) 20 | dt_naive = datetime.strptime(datetime_str, "%a %b %d %H:%M:%S +0000 %Y") 21 | return dt_naive.replace(tzinfo=timezone.utc) 22 | except ValueError: 23 | console.print(f"Error parsing datetime string: {datetime_str}", style="red") 24 | # Return epoch as a fallback to allow sorting even if parsing fails 25 | return datetime.fromtimestamp(0, tz=timezone.utc) 26 | 27 | 28 | def fetch_tweet_thread(url: str) -> str: 29 | """ 30 | Fetches the content of a tweet and its potential thread using twitterapi.io. 31 | 32 | Args: 33 | url: The URL of the tweet. 34 | 35 | Returns: 36 | A string containing the formatted tweet thread, or an error message starting with "Error:". 37 | """ 38 | API_BASE_URL = "https://api.twitterapi.io" 39 | # Try reading with underscore first (common in container envs), fallback to hyphen 40 | API_KEY = os.getenv("TWITTER_API_IO_KEY") 41 | 42 | if not API_KEY: 43 | # Update error message to reflect both attempts 44 | return "Error: TWITTER_API_IO_KEY not found in environment variables." 45 | 46 | # 1. Extract Tweet ID 47 | match = re.search(r"/status(?:es)?/(\d+)", url) 48 | if not match: 49 | return f"Error: Could not extract Tweet ID from URL: {url}" 50 | tweet_id = match.group(1) 51 | console.print(f"Extracted Tweet ID: {tweet_id}", style="cyan") 52 | 53 | headers = {"X-API-Key": API_KEY} 54 | all_tweets = [] 55 | conversation_id = None 56 | main_tweet_data = None 57 | 58 | # 2. 
def fetch_tweet_thread(url: str) -> str:
    """
    Fetch the content of a tweet and its potential thread using twitterapi.io.

    Args:
        url: The URL of the tweet (x.com / twitter.com status link).

    Returns:
        A string containing the formatted tweet thread, or an error message
        starting with "Error:".
    """
    API_BASE_URL = "https://api.twitterapi.io"
    API_KEY = os.getenv("TWITTER_API_IO_KEY")

    if not API_KEY:
        return "Error: TWITTER_API_IO_KEY not found in environment variables."

    # 1. Extract the numeric tweet ID from the /status/<id> URL segment.
    match = re.search(r"/status(?:es)?/(\d+)", url)
    if not match:
        return f"Error: Could not extract Tweet ID from URL: {url}"
    tweet_id = match.group(1)
    console.print(f"Extracted Tweet ID: {tweet_id}", style="cyan")

    headers = {"X-API-Key": API_KEY}
    all_tweets = []
    conversation_id = None
    main_tweet_data = None

    # 2. Fetch the main tweet.
    try:
        console.print(f"Fetching main tweet ID: {tweet_id}", style="cyan")
        main_tweet_url = f"{API_BASE_URL}/twitter/tweets"
        params = {"tweet_ids": [tweet_id]}
        # A timeout prevents the bot from hanging forever on a stalled API.
        response = requests.get(
            main_tweet_url, headers=headers, params=params, timeout=30
        )
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        data = response.json()

        if data.get("status") != "success" or not data.get("tweets"):
            error_msg = data.get("msg", "Unknown error")
            return f"Error: Failed to fetch main tweet {tweet_id}. API Status: {data.get('status')}, Msg: {error_msg}"

        main_tweet_data = data["tweets"][0]
        all_tweets.append(main_tweet_data)
        conversation_id = main_tweet_data.get("conversationId")
        console.print(
            f"Main tweet fetched. Conversation ID: {conversation_id}", style="green"
        )

    except requests.exceptions.RequestException as e:
        return f"Error: Network or API error fetching main tweet {tweet_id}: {e}"
    except Exception as e:
        return f"Error: Unexpected error processing main tweet response: {e}"

    # 3. Fetch the conversation/thread if conversationId is valid and different
    #    from tweet_id (a single tweet's conversationId is often its own id).
    if conversation_id and conversation_id != tweet_id:
        try:
            console.print(
                f"Fetching conversation thread ID: {conversation_id}", style="cyan"
            )
            thread_url = f"{API_BASE_URL}/twitter/tweet/advanced_search"
            params = {
                "query": f"conversation_id:{conversation_id}",
                # NOTE(review): check the API docs for a sort/filter option that
                # returns only replies posted *after* the main tweet.
            }
            response = requests.get(
                thread_url, headers=headers, params=params, timeout=30
            )
            response.raise_for_status()

            data = response.json()

            if data.get("status") == "success" and data.get("tweets"):
                thread_tweets = data["tweets"]
                # The conversation search may echo the main tweet back; drop it
                # so it is not shown twice.
                filtered_thread_tweets = [
                    t for t in thread_tweets if t.get("id") != tweet_id
                ]
                all_tweets.extend(filtered_thread_tweets)
                console.print(
                    f"Fetched {len(filtered_thread_tweets)} additional tweets in conversation.",
                    style="green",
                )
            elif data.get("status") != "success":
                console.print(
                    f"Warning: Failed to fetch conversation thread {conversation_id}. API Status: {data.get('status')}, Msg: {data.get('msg', 'Unknown error')}",
                    style="yellow",
                )
                # Proceed with only the main tweet.

        except requests.exceptions.RequestException as e:
            console.print(
                f"Warning: Network or API error fetching conversation thread {conversation_id}: {e}",
                style="yellow",
            )
            # Proceed with only the main tweet.
        except Exception as e:
            console.print(
                f"Warning: Unexpected error processing conversation thread response: {e}",
                style="yellow",
            )
            # Proceed with only the main tweet.

    # 4. Sort tweets chronologically; a missing createdAt sorts to the epoch.
    all_tweets.sort(
        key=lambda t: _parse_twitter_datetime(
            t.get("createdAt", "Thu Jan 01 00:00:00 +0000 1970")
        )
    )

    # 5. Format the output: one numbered entry per tweet, separated by "---".
    output_lines = []
    for i, tweet in enumerate(all_tweets):
        author_info = tweet.get("author", {})
        username = author_info.get("userName", "unknown_user")
        created_at_str = tweet.get("createdAt", "Unknown time")
        text = tweet.get("text", "").strip()

        line = f"Tweet {i + 1}/{len(all_tweets)} by @{username} ({created_at_str}):\n{text}\n---"
        output_lines.append(line)

    if not output_lines:
        # Should not happen if the main tweet fetch succeeded.
        return f"Error: No tweet data could be formatted for tweet ID {tweet_id}."

    return "\n".join(output_lines).strip()


# Example usage (for testing this script directly)
if __name__ == "__main__":
    # Test with a known tweet URL (replace with a real one, potentially a thread)
    # test_url_single = "https://x.com/levelsio/status/1798629243934064791"  # Example single tweet
    test_url_thread_start = "https://x.com/omarsar0/status/1917939469103305013?s=52"  # Example thread start (replace if needed)

    print(f"--- Testing with URL: {test_url_thread_start} ---")
    result = fetch_tweet_thread(test_url_thread_start)
    print("\n--- RESULT ---")
    print(result)
    print("--- END TEST ---")
import argparse
import os
import textwrap
from dotenv import load_dotenv
import agentql
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

load_dotenv()


def scrape_youtube(url: str, headless: bool = True) -> dict[str, str]:
    """Return the video's title and full description.

    Args:
        url: Full YouTube watch URL.
        headless: Launch Chromium without a visible window when True.

    Returns:
        A dict with keys ``"title"`` and ``"description"``.
    """

    # 0. Configure AgentQL from the environment.
    agentql.configure(api_key=os.getenv("AGENTQL_API_KEY", ""))

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # Ensure the browser process is closed even if navigation or
        # extraction raises — otherwise each failure leaks a Chromium instance.
        try:
            page = agentql.wrap(browser.new_page())

            # Navigate to the video URL and wait until the page is fully idle.
            page.goto(url, wait_until="domcontentloaded")
            page.wait_for_page_ready_state()

            # 1. Accept cookies (EU consent banner), if present.
            try:
                consent = page.query_elements(
                    """
                    {
                        accept_cookies_btn
                    }
                    """
                )
                consent.accept_cookies_btn.click(timeout=3000)
            except Exception:
                # Fallback for sites that use a different dialog text.
                try:
                    page.locator("button:has-text('Accept all')").click(timeout=3000)
                except PlaywrightTimeoutError:
                    pass  # No consent dialog present

            # 2. Expand the description (click "Show more").
            try:
                controls = page.query_elements(
                    """
                    {
                        expand_description_btn
                    }
                    """
                )
                controls.expand_description_btn.click(timeout=3000)
            except Exception:
                # Fallback selector if AgentQL can't find the button.
                try:
                    page.locator("tp-yt-paper-button:has-text('more')").click(
                        timeout=3000
                    )
                except PlaywrightTimeoutError:
                    pass

            # 3. Extract the title and the full description using AgentQL.
            data = page.query_data(
                """
                {
                    video_title
                    description_text
                }
                """
            )
        finally:
            browser.close()

    return {
        "title": data["video_title"],
        "description": textwrap.dedent(data["description_text"]).strip(),
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape YouTube title and description via Playwright + AgentQL."
    )
    parser.add_argument(
        "--url",
        required=True,
        help="Full YouTube video URL",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        help="Run browser in headless mode (default: GUI).",
    )
    args = parser.parse_args()

    result = scrape_youtube(args.url, headless=args.headless)
    print("\n=== RESULT ===")
    print("Title:", result["title"])
    print("\nDescription:\n", result["description"])