├── .dockerignore ├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── agent.py ├── agent_viz.py ├── baml_client ├── __init__.py ├── async_client.py ├── async_request.py ├── config.py ├── globals.py ├── inlinedbaml.py ├── parser.py ├── partial_types.py ├── sync_client.py ├── sync_request.py ├── tracing.py ├── type_builder.py └── types.py ├── baml_src ├── clients.baml ├── generators.baml ├── router.baml └── summarize.baml ├── bot.py ├── images ├── image.png └── system_arch.jpg ├── langgraph.json ├── pyproject.toml ├── scripts ├── deploy_cloud_run.sh ├── deploy_server.sh ├── run_docker.sh ├── run_local.sh └── setup_secrets.sh ├── tools ├── __init__.py ├── linkedin_agentql_scraper.py ├── pdf_handler.py ├── search.py ├── twitter_api_tool.py └── youtube_agentql_scraper.py └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git files 2 | .git 3 | .gitignore 4 | 5 | # Python virtual environment 6 | .venv 7 | 8 | # Python cache 9 | __pycache__/ 10 | *.pyc 11 | *.pyo 12 | *.pyd 13 | 14 | # OS specific files 15 | .DS_Store 16 | 17 | # Secrets 18 | .env 19 | 20 | # We copy pyproject.toml and uv.lock now; requirements.txt is no longer used by the Dockerfile 21 | # uv.lock 22 | # pyproject.toml 23 | requirements.txt 24 | 25 | # Other build artifacts if any 26 | .idea/ 27 | *.egg-info/ 28 | dist/ 29 | build/ 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | .env 12 | __pycache__ 13 | 14 | .langgraph_api -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # # Dockerfile 2 | # --- 1. use Microsoft's pre-built image (has Chromium + all libs) 3 | FROM mcr.microsoft.com/playwright/python:v1.52.0-noble 4 | 5 | WORKDIR /app 6 | 7 | # --- 2. install uv 8 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /usr/local/bin/ 9 | 10 | # --- 3. copy dependency files 11 | COPY pyproject.toml uv.lock ./ 12 | 13 | # --- 4. install dependencies using uv 14 | RUN uv sync --frozen --no-cache 15 | 16 | # --- 5. copy code & launch 17 | COPY . . 18 | ENV PORT=8080 PYTHONUNBUFFERED=1 19 | CMD ["uv", "run", "uvicorn", "bot:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Telegram Link Summarizer Agent 2 | 3 | [Join the Telegram Channel @tech_links for curated tech news and interesting links](https://t.me/tech_links) 4 | 5 | [![GitHub Repo stars](https://img.shields.io/github/stars/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 6 | [![GitHub forks](https://img.shields.io/github/forks/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 7 | [![GitHub License](https://img.shields.io/github/license/kargarisaac/telegram_link_summarizer_agent)](https://github.com/kargarisaac/telegram_link_summarizer_agent) 8 | [![Telegram Channel](https://img.shields.io/badge/Telegram-Join%20Channel-blue?logo=telegram)](https://t.me/tech_links) 9 | [![Ask 
DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kargarisaac/telegram_link_summarizer_agent) 10 | 11 | If you want to get the latest news and interesting links for free, feel free to join the channel. If you find this project useful, giving the repository a star would be appreciated. 12 | 13 | ![Agent Visualization](./images/image.png) 14 | 15 | ![System Diagram](./images/system_arch.jpg) 16 | 17 | 18 | An agentic Telegram bot designed to summarize web links (articles, papers, tweets etc.) sent in a chat. It uses LangGraph to orchestrate multiple tools and language models to determine the link type, extract content, and generate concise summaries. 19 | 20 | ## ✨ Features 21 | 22 | * **Link Summarization:** Extracts content from URLs (webpages, PDFs, Twitter/X, LinkedIn posts) and provides summaries. 23 | * **Robust YouTube Support:** Handles YouTube links using Playwright and AgentQL to extract video title and description. 24 | * **LLM Routing:** Uses a BAML LLM function (`RouteRequest`) to determine the type of link (Webpage, PDF, Twitter, LinkedIn, Unsupported). 25 | * **Web Search/Extraction:** Uses Tavily for standard webpage content extraction. 26 | * **PDF Support:** Can process and summarize PDF documents found at URLs. 27 | * **Twitter/X Support:** Fetches tweet content (including threads) using the `twitterapi.io` service. 28 | * **LinkedIn Support:** Extracts content from LinkedIn post URLs using Playwright and AgentQL. 29 | * **Agentic Workflow:** Leverages LangGraph for a multi-step reasoning process. 30 | * **BAML Integration:** Uses BAML for structured output generation (summaries and routing). 31 | * **Telegram Bot Interface:** Interacts via a simple Telegram bot, replying silently on failure. 
32 | 33 | ## 🛠️ Tech Stack 34 | 35 | * **Routing/Summarization:** BAML (Boundary) + LLM (e.g., Gemini, Deepseek) 36 | * **Orchestration:** LangGraph 37 | * **YouTube Extraction:** `playwright`, `agentql` 38 | * **Twitter/X API:** `twitterapi.io` via `requests` 39 | * **Web Extraction:** Tavily Search SDK 40 | * **LinkedIn Extraction:** `playwright`, `agentql` 41 | * **PDF Extraction:** PyMuPDF (`fitz`) 42 | * **Telegram Bot:** `python-telegram-bot` 43 | * **Web Framework:** FastAPI + Uvicorn 44 | * **Dependencies:** Managed via `pyproject.toml` (using `uv` or `pip`) 45 | 46 | ## 🚀 Setup 47 | 48 | 1. **Clone the repository:** 49 | ```bash 50 | git clone https://github.com/kargarisaac/telegram_link_summarizer_agent.git 51 | cd telegram_link_summarizer_agent 52 | ``` 53 | 54 | 2. **Install Dependencies (using [`uv`](https://github.com/astral-sh/uv) or `pip`)** 55 | * You can use [`uv`](https://github.com/astral-sh/uv) or standard `pip`: 56 | ```bash 57 | # Using uv (recommended) 58 | uv pip install -e . # Install in editable mode 59 | 60 | # Or using pip 61 | pip install -e . # Install in editable mode 62 | ``` 63 | * Install Playwright browsers: 64 | ```bash 65 | playwright install 66 | ``` 67 | 68 | 3. **Set up Environment Variables:** 69 | Create a file named `.env` in the project root directory. 
Add the following environment variables with your actual values: 70 | ```env 71 | # --- Core API Keys --- 72 | # Select *one* LLM provider for BAML functions (or configure multiple providers) 73 | # GEMINI_API_KEY="your_google_gemini_api_key" # For Google LLMs 74 | DEEPSEEK_API_KEY="your_deepseek_api_key" # For Deepseek LLMs 75 | GOOGLE_API_KEY="your_google_cloud_api_key" # e.g., For Google LLMs or other Google Cloud services 76 | 77 | # Tools 78 | TAVILY_API_KEY="your_tavily_api_key" 79 | TWITTER_API_IO_KEY="your_twitterapi.io_api_key" # API Key for twitterapi.io service 80 | AGENTQL_API_KEY="your_agentql_api_key" # API Key for AgentQL 81 | 82 | # --- Telegram Bot Configuration --- 83 | TELEGRAM_BOT_TOKEN="your_telegram_bot_token" 84 | 85 | # --- Webhook Configuration (Needed for deployment or local testing with ngrok) --- 86 | # For ngrok, use the https://.ngrok-free.app URL 87 | # For deployment, this isn't strictly needed in the .env for the *deployed* app, 88 | # but the deployment script will set the webhook based on the Cloud Run URL. 89 | # WEBHOOK_URL="your_webhook_url_or_ngrok_url" 90 | 91 | # Secure your webhook (generate strong random strings for these) 92 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_secret_token" 93 | # Example: /webhook/aBcDeF12345 - must start with a slash! 94 | # WEBHOOK_SECRET_PATH="/your_unique_and_random_webhook_path" 95 | 96 | # --- Polling vs Webhook Mode (for bot.py) --- 97 | # Set USE_POLLING to "true" to run the bot in polling mode (recommended for self-managed servers without HTTPS). 98 | # If USE_POLLING is "true", WEBHOOK_URL and related settings are ignored by bot.py. 99 | # Defaults to webhook mode if not set or "false". 
100 | # USE_POLLING="true" 101 | 102 | # --- Webhook Configuration (Only if NOT using USE_POLLING="true") --- 103 | # For local ngrok testing: 104 | # WEBHOOK_URL="https://your-ngrok-subdomain.ngrok-free.app" 105 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 106 | 107 | # For self-managed server with public IP (HTTP, for testing - Telegram prefers HTTPS for production): 108 | # WEBHOOK_URL="http://YOUR_SERVER_IP:8080" 109 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 110 | 111 | # For self-managed server with domain and HTTPS (Production Webhook): 112 | # WEBHOOK_URL="https://yourbot.yourdomain.com" # Nginx would proxy to http://localhost:8080 113 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 114 | 115 | # For Google Cloud Run (this is typically set by the deploy_cloud_run.sh script, not manually in .env): 116 | # WEBHOOK_URL="your_cloud_run_service_url" 117 | # WEBHOOK_SECRET_PATH="your_unique_and_random_webhook_path" # e.g., webhook_abc123 (no leading slash for bot.py) 118 | 119 | # Secure your webhook (generate strong random strings for these) - ALWAYS NEEDED FOR WEBHOOK MODE 120 | # TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_secret_token" 121 | ``` 122 | **Important:** 123 | * Get your `TWITTER_API_IO_KEY` from [twitterapi.io](https://twitterapi.io/). 124 | * Ensure your chosen LLM API Key (`GEMINI_API_KEY` or `DEEPSEEK_API_KEY`) is uncommented and valid. 125 | * Keep your `.env` file secure and do not commit it. The `.gitignore` should exclude `.env`. 126 | 127 | ## ▶️ Usage 128 | 129 | 1. **(Optional) Run the Agent Script Directly (for testing specific URLs):** 130 | * You can test the core agent logic by running `agent.py`. Modify the test cases at the bottom of the script. 
131 | ```bash 132 | python agent.py 133 | ``` 134 | 135 | ## 📊 Agent Visualization 136 | 137 | The `agent_viz.py` script can be used to generate a visualization of the LangGraph agent (like the image at the top). Ensure `graphviz` is installed (`brew install graphviz` or `sudo apt-get install graphviz`). 138 | 139 | ```bash 140 | python agent_viz.py 141 | ``` 142 | This will generate an `agent_graph.png` file. 143 | 144 | ## Local Running (Webhook Mode) 145 | 146 | This runs the FastAPI server using `uvicorn`. This requires `USE_POLLING="false"` and a publicly accessible `WEBHOOK_URL` set in your `.env` file (e.g., using ngrok) for the bot to receive messages from Telegram. 147 | 148 | Make sure you have installed dependencies (`uv pip install -e .`) and configured your `.env` file. 149 | 150 | ```bash 151 | # Make the script executable (only needed once) 152 | chmod +x ./scripts/run_local.sh 153 | 154 | # Run the local server 155 | ./scripts/run_local.sh 156 | ``` 157 | 158 | You can check if the server is running by accessing the health check endpoint: `curl http://localhost:8080/health` 159 | 160 | ## Testing Webhooks Locally with ngrok 161 | 162 | When running your bot locally, Telegram cannot reach your computer directly because `localhost` is not accessible from the public internet. To test real Telegram messages with webhooks during development, you can use [ngrok](https://ngrok.com/) to create a secure tunnel from a public URL to your local machine. 163 | 164 | ### Steps 165 | 166 | 1. **Install ngrok:** 167 | - Download from https://ngrok.com/download or install via your package manager. 168 | - On macOS (Homebrew): 169 | ```bash 170 | brew install ngrok 171 | ``` 172 | - On Linux: 173 | ```bash 174 | sudo snap install ngrok 175 | ``` 176 | - On Windows: Download and extract the executable from the website. 177 | 178 | 2. 
**Start your local server:** 179 | ```bash 180 | # Ensure the script is executable 181 | chmod +x ./scripts/run_local.sh 182 | # Run the local server 183 | ./scripts/run_local.sh 184 | ``` 185 | 186 | 3. **Start ngrok to expose port 8080:** 187 | ```bash 188 | ngrok http 8080 189 | ``` 190 | - You will see output like: 191 | ``` 192 | Forwarding https://abcd-1234.ngrok-free.app -> http://localhost:8080 193 | ``` 194 | - Copy the HTTPS URL provided by ngrok (e.g., `https://abcd-1234.ngrok-free.app`). 195 | 196 | 4. **Update your `.env` file:** 197 | - Set the `WEBHOOK_URL` to the ngrok HTTPS URL: 198 | ```env 199 | WEBHOOK_URL=https://your-ngrok-url.ngrok-free.app 200 | ``` 201 | - Save the file. 202 | 203 | 5. **Restart your local server:** 204 | - Stop the running `./scripts/run_local.sh` process (Ctrl+C) and start it again: 205 | ```bash 206 | ./scripts/run_local.sh 207 | ``` 208 | - On startup, the bot should attempt to register the webhook with Telegram using your public ngrok URL (if `bot.py` is configured to do so based on `WEBHOOK_URL`). 209 | 210 | 6. **Test your bot:** 211 | - Send a message with a link to your Telegram bot as usual. 212 | - Telegram will send the update to your ngrok public URL, which forwards it to your local server. 213 | - You should see logs in your terminal and receive a response from your local bot. 214 | 215 | **Tip:** If you restart ngrok, you will get a new public URL. Update your `.env` and restart the server each time. 216 | 217 | **Security Note:** For production, always use a secret path (`WEBHOOK_SECRET_PATH`) and a secret token (`TELEGRAM_WEBHOOK_SECRET_TOKEN`) for your webhook endpoint. For local ngrok testing, ensure these variables are also set in your `.env` if your `bot.py` requires them even locally. 218 | 219 | ## Docker Testing 220 | 221 | This builds the Docker image and runs the container locally. Ensure your `.env` file is present and configured in the project root. 
222 | 223 | ```bash 224 | # Make the script executable (only needed once) 225 | chmod +x ./scripts/run_docker.sh 226 | 227 | # Build and run the Docker container 228 | ./scripts/run_docker.sh 229 | ``` 230 | 231 | You can check the health endpoint at `http://localhost:8080/health` 232 | 233 | ### Testing Docker Locally with ngrok 234 | 235 | You can also test the Docker container with ngrok to receive real Telegram messages: 236 | 237 | 1. **Run the Docker Container:** 238 | ```bash 239 | # Ensure script is executable 240 | chmod +x ./scripts/run_docker.sh 241 | # Build and run the container (loads .env) 242 | ./scripts/run_docker.sh 243 | ``` 244 | *(Leave this terminal running)* 245 | 246 | 2. **Start ngrok:** In a *new* terminal, run: 247 | ```bash 248 | ngrok http 8080 249 | ``` 250 | Copy the HTTPS URL provided by ngrok. 251 | 252 | 3. **Update `.env`:** Set the `WEBHOOK_URL` variable in your `.env` file to the ngrok HTTPS URL. 253 | 254 | 4. **Restart Docker Container:** Stop the running container (Ctrl+C in the first terminal, or `docker stop summarizer-bot`) and restart it using: 255 | ```bash 256 | ./scripts/run_docker.sh 257 | ``` 258 | This ensures the container picks up the new `WEBHOOK_URL` from the `.env` file. 259 | 260 | 5. **Test:** Send messages to your bot. They should be routed through ngrok to your running Docker container. 261 | 262 | ## Deploying to a Self-Managed Server/VM (Docker) 263 | 264 | This method uses Docker and the provided `scripts/deploy_server.sh` script to deploy the bot to your own virtual machine or dedicated server. This is the recommended approach for self-hosting. 265 | 266 | ### 1. Server Preparation 267 | 268 | SSH into your server and ensure `git` and `docker` are installed. 
269 | 270 | ```bash 271 | # Update system (example for Debian/Ubuntu) 272 | sudo apt update && sudo apt upgrade -y 273 | 274 | # Install Git 275 | sudo apt install -y git 276 | 277 | # Install Docker 278 | sudo apt install -y docker.io 279 | sudo systemctl start docker 280 | sudo systemctl enable docker 281 | 282 | # Optional: Add your user to the docker group to run docker commands without sudo 283 | # sudo usermod -aG docker $USER 284 | # newgrp docker # Or log out and log back in 285 | ``` 286 | 287 | ### 2. Clone Repository 288 | 289 | Clone your repository onto the server: 290 | ```bash 291 | git clone 292 | cd telegram_link_summarizer_agent 293 | ``` 294 | 295 | ### 3. Configure Environment (`.env` file) 296 | 297 | Create a `.env` file in the project root on your server. 298 | 299 | **Option A: Polling Mode (Recommended for Simplicity)** 300 | This is the easiest way to get started on a self-managed server as it doesn't require a public domain, SSL, or complex firewall/proxy setup beyond allowing outbound connections. 301 | 302 | ```env 303 | # In your .env file on the server: 304 | USE_POLLING="true" 305 | 306 | # --- Core API Keys --- 307 | DEEPSEEK_API_KEY="your_deepseek_api_key" 308 | # GEMINI_API_KEY="your_google_gemini_api_key" 309 | TAVILY_API_KEY="your_tavily_api_key" 310 | TWITTER_API_IO_KEY="your_twitterapi.io_api_key" 311 | AGENTQL_API_KEY="your_agentql_api_key" 312 | 313 | # --- Telegram Bot Configuration --- 314 | TELEGRAM_BOT_TOKEN="your_telegram_bot_token" 315 | 316 | # --- Webhook related variables can be omitted or commented out when USE_POLLING="true" --- 317 | # WEBHOOK_URL= 318 | # WEBHOOK_SECRET_PATH= 319 | # TELEGRAM_WEBHOOK_SECRET_TOKEN= 320 | ``` 321 | 322 | **Option B: Webhook Mode** 323 | If you prefer webhook mode, you'll need a way for Telegram to reach your bot. 
324 | 325 | * **Using Server IP (HTTP - for testing only, Telegram prefers HTTPS):** 326 | ```env 327 | # .env on server 328 | USE_POLLING="false" # Or omit 329 | WEBHOOK_URL="http://YOUR_SERVER_PUBLIC_IP:8080" 330 | WEBHOOK_SECRET_PATH="your_random_webhook_path_string" # e.g., webhook_bot123 (NO leading slash here) 331 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_token" 332 | # ... other API keys ... 333 | ``` 334 | You'll also need to ensure your server's firewall allows inbound traffic on port `8080`. 335 | ```bash 336 | sudo ufw allow 8080/tcp 337 | ``` 338 | 339 | * **Using a Domain Name (HTTPS - Recommended for Production Webhooks):** 340 | This involves setting up a domain name pointing to your server, using a reverse proxy like Nginx, and obtaining an SSL certificate (e.g., with Let's Encrypt). 341 | ```env 342 | # .env on server 343 | USE_POLLING="false" # Or omit 344 | WEBHOOK_URL="https://yourbot.yourdomain.com" # Nginx will handle HTTPS and proxy to the bot 345 | WEBHOOK_SECRET_PATH="your_random_webhook_path_string" 346 | TELEGRAM_WEBHOOK_SECRET_TOKEN="your_strong_random_token" 347 | # ... other API keys ... 348 | ``` 349 | Your Nginx would be configured to listen on port 443 (HTTPS), terminate SSL, and proxy requests for your `WEBHOOK_SECRET_PATH` to `http://localhost:8080`. Firewall should allow port 443. 350 | 351 | ### 4. Run Deployment Script 352 | 353 | The `deploy_server.sh` script will build the Docker image and start the container. 354 | ```bash 355 | chmod +x ./scripts/deploy_server.sh 356 | ./scripts/deploy_server.sh 357 | ``` 358 | The script uses port `8080` by default. 359 | 360 | ### 5. 
Monitoring 361 | 362 | * **View logs:** `docker logs -f telegram-summarizer` 363 | * **Check status:** `docker ps` 364 | * **Stop:** `docker stop telegram-summarizer` 365 | * **Start:** `docker start telegram-summarizer` 366 | * **Restart:** `docker restart telegram-summarizer` 367 | 368 | If using polling mode, the bot should start processing messages. If using webhook mode, ensure your webhook is correctly set with Telegram (the `deploy_server.sh` script attempts this if it finds webhook variables in `.env`, but `bot.py` also tries on startup). 369 | 370 | ## Deploying to Google Cloud Run 371 | 372 | This guide assumes you have a GCP account, `gcloud` CLI installed and configured, and Docker installed. 373 | 374 | The deployment process involves: 375 | 1. **Setting up Secrets:** Securely store your API keys and tokens in Google Cloud Secret Manager. 376 | 2. **Building & Pushing Image:** Build the Docker image and push it to Google Artifact Registry. 377 | 3. **Deploying Service:** Deploy the image to Cloud Run, mapping the secrets to environment variables. 378 | 4. **Setting Webhook:** Configure the Telegram webhook to point to your Cloud Run service URL. 379 | 380 | We have provided scripts to streamline this process. 381 | 382 | ### 1. Setup Secrets 383 | 384 | This script helps you create secrets in Google Cloud Secret Manager and add your sensitive values (API keys, tokens). 385 | 386 | **IMPORTANT:** Before running, you **must** edit the `SECRETS` array inside `scripts/setup_secrets.sh` to include the *exact names* of the environment variables defined in your `.env` file (e.g., `TELEGRAM_BOT_TOKEN`, `TAVILY_API_KEY`, `TWITTER_API_IO_KEY`, `GEMINI_API_KEY` etc.). 
387 | 388 | ```bash 389 | # Make the script executable (only needed once) 390 | chmod +x ./scripts/setup_secrets.sh 391 | 392 | # Run the secret setup script (it will prompt for project ID and secret values) 393 | ./scripts/setup_secrets.sh 394 | ``` 395 | 396 | Follow the prompts to enter your GCP Project ID (if not already configured) and the values for each secret. 397 | 398 | ### 2. Deploy to Cloud Run 399 | 400 | This script automates building the image, pushing it to Artifact Registry, deploying to Cloud Run, and setting the Telegram webhook. 401 | 402 | **IMPORTANT:** Before running, you **must** edit the `SECRETS_TO_MAP` array inside `scripts/deploy_cloud_run.sh`. This array defines how the secrets you created map to environment variables in your Cloud Run service. Ensure the secret names match those used in `setup_secrets.sh` (e.g., `TWITTER_API_IO_KEY=twitter-api-io-key-secret-name:latest`). 403 | 404 | ```bash 405 | # Make the script executable (only needed once) 406 | chmod +x ./scripts/deploy_cloud_run.sh 407 | 408 | # Run the deployment script (it will prompt for configuration) 409 | ./scripts/deploy_cloud_run.sh 410 | ``` 411 | 412 | The script will prompt you for your GCP Project ID, Region, Service Name, and Artifact Registry Repository Name if they are not set as environment variables. It will then guide you through the build, push, and deployment process, including setting the Telegram webhook automatically if it can find your `TELEGRAM_BOT_TOKEN` secret mapping. 413 | 414 | ### Manual Steps (If needed) 415 | 416 |
Click to view manual gcloud commands

1. **Set Environment Variables (Shell):**
    ```bash
    export PROJECT_ID="your-gcp-project-id"
    export REGION="your-preferred-region" # e.g., us-central1
    export SERVICE_NAME="telegram-summarizer"
    export REPO_NAME="my-summarizer-bot-repo" # Or your preferred Artifact Registry repo name
    export IMAGE_NAME="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO_NAME}/${SERVICE_NAME}:latest"

    gcloud config set project $PROJECT_ID
    gcloud config set run/region $REGION
    ```
2. **Enable Required APIs:**
    ```bash
    gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com secretmanager.googleapis.com
    ```
3. **Create Artifact Registry Repository (if needed):**
    ```bash
    gcloud artifacts repositories create $REPO_NAME \
        --repository-format=docker \
        --location=$REGION \
        --description="Docker repository for bots"
    ```
4. **Configure Docker Authentication:**
    ```bash
    gcloud auth configure-docker ${REGION}-docker.pkg.dev
    ```
5. **Manage Secrets with Secret Manager (Recommended):**
    Store API keys and tokens securely using Google Cloud Secret Manager. Use the `gcloud` CLI (as done by `setup_secrets.sh`):

    * **Create Secret:** (Example: `twitter-api-io-key` for the twitterapi.io key)
      ```bash
      gcloud secrets create twitter-api-io-key --replication-policy="automatic"
      # Add others like tavily-api-key, telegram-bot-token, gemini-api-key, etc.
      ```

    * **Add Secret Version:**
      ```bash
      printf "YOUR_ACTUAL_TWITTERAPI_IO_KEY" | gcloud secrets versions add twitter-api-io-key --data-file=-
      # Add versions for other secrets...
      ```

6. **Build and Push Docker Image:**
    ```bash
    # Build
    docker build -t $IMAGE_NAME .
    # Push
    docker push $IMAGE_NAME
    ```

7.
**Deploy to Cloud Run:** 468 | Replace `SECRET_NAME=SECRET_ID:latest,...` with your actual secret mappings, including `TWITTER_API_IO_KEY`. 469 | ```bash 470 | gcloud run deploy $SERVICE_NAME \ 471 | --image $IMAGE_NAME \ 472 | --platform managed \ 473 | --region $REGION \ 474 | --port 8080 \ 475 | --allow-unauthenticated \ 476 | --set-secrets=TELEGRAM_BOT_TOKEN=telegram-bot-token:latest,TAVILY_API_KEY=tavily-api-key:latest,GEMINI_API_KEY=gemini-api-key:latest,TWITTER_API_IO_KEY=twitter-api-io-key:latest,TELEGRAM_WEBHOOK_SECRET_TOKEN=webhook-secret-token:latest 477 | # Adjust secret names (e.g., twitter-api-io-key, webhook-secret-token) and versions as needed 478 | ``` 479 | 480 | 8. **Get Service URL & Set Telegram Webhook:** 481 | ```bash 482 | # Get the URL 483 | SERVICE_URL=$(gcloud run services describe $SERVICE_NAME --platform managed --region $REGION --format 'value(status.url)') 484 | echo "Service URL: $SERVICE_URL" 485 | 486 | # Get your bot token (replace secret-id if different) 487 | TELEGRAM_BOT_TOKEN=$(gcloud secrets versions access latest --secret=telegram-bot-token) 488 | # Get your webhook secret (optional, replace secret-id if different) 489 | WEBHOOK_SECRET=$(gcloud secrets versions access latest --secret=telegram-webhook-secret-token) 490 | 491 | # Get your webhook path (replace secret-id if different) 492 | WEBHOOK_SECRET_PATH_VAL=$(gcloud secrets versions access latest --secret=webhook-secret-path) # Assuming you stored it 493 | # Get your webhook secret token (replace secret-id if different) 494 | WEBHOOK_SECRET_TOKEN_VAL=$(gcloud secrets versions access latest --secret=webhook-secret-token) 495 | 496 | curl -F "url=${SERVICE_URL}${WEBHOOK_SECRET_PATH_VAL}" \ 497 | -F "secret_token=${WEBHOOK_SECRET_TOKEN_VAL}" \ 498 | https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook 499 | ``` 500 | 501 |
502 | 503 | ## License 504 | 505 | This project is licensed under the Apache License 2.0. See the [LICENSE](LICENSE) file for details. 506 | 507 | --- 508 | 509 | **Stay updated:** [Join the Telegram Channel @tech_links](https://t.me/tech_links) for the latest news and interesting links. If you find this project useful, please consider starring the repository. 510 | 511 | ## Star History 512 | 513 | [![Star History Chart](https://api.star-history.com/svg?repos=kargarisaac/telegram_link_summarizer_agent&type=Date)](https://www.star-history.com/#kargarisaac/telegram_link_summarizer_agent&Date) -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import logging # Added for youtube scraper logging visibility 4 | from typing import Any, Dict, TypedDict, Union, Optional 5 | 6 | from baml_client import b 7 | from baml_client.types import ContentType, Summary, ExtractorTool 8 | from dotenv import load_dotenv 9 | 10 | from langgraph.graph import StateGraph, END 11 | from rich.console import Console 12 | from tools.search import run_tavily_tool 13 | from tools.pdf_handler import get_pdf_text 14 | from tools.twitter_api_tool import fetch_tweet_thread 15 | from tools.linkedin_agentql_scraper import ( 16 | scrape_linkedin_post as scrape_linkedin_post_agentql, 17 | ) 18 | from tools.youtube_agentql_scraper import scrape_youtube as scrape_youtube_agentql 19 | 20 | load_dotenv() 21 | 22 | console = Console() 23 | 24 | # Configure logging slightly for better visibility from tools 25 | logging.basicConfig( 26 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s" 27 | ) 28 | logging.getLogger("httpx").setLevel(logging.WARNING) # Reduce noise from http libraries 29 | logging.getLogger("googleapiclient.discovery_cache").setLevel(logging.ERROR) 30 | 31 | # --- LangGraph Agent State --- 32 | 33 | 34 | class 
AgentState(TypedDict): 35 | original_message: str 36 | url: str 37 | content_type: ContentType # 'web' or 'pdf' or others? 38 | content: str 39 | summary: str 40 | error: Optional[str] 41 | route_decision: Optional[str] # To store the routing decision string 42 | needs_web_fallback: bool # Flag for YouTube fallback 43 | 44 | 45 | # --- Define Graph Nodes --- 46 | 47 | 48 | def init_state(state: AgentState) -> Dict[str, Any]: 49 | """Extracts the URL from the original message.""" 50 | console.print("---INIT STATE---", style="yellow bold") 51 | message = state["original_message"] 52 | # Basic URL extraction (consider a more robust regex) 53 | url = next((word for word in message.split() if word.startswith("http")), None) 54 | error = None if url else "No URL found in the message." 55 | 56 | if error: 57 | console.print(f"Initialization error: {error}", style="red") 58 | 59 | return { 60 | "original_message": message, 61 | "url": url if url else "", # Ensure url is always a string 62 | "content_type": ContentType.Webpage, # Default, gets updated later 63 | "content": "", 64 | "summary": "", 65 | "error": error, 66 | "route_decision": None, 67 | "needs_web_fallback": False, # Initialize flag 68 | } 69 | 70 | 71 | async def llm_router(state: AgentState) -> Dict[str, Any]: 72 | """Determines the content extraction route using the BAML LLM Router.""" 73 | console.print("---LLM ROUTER (BAML)--- ", style="yellow bold") 74 | 75 | # If init failed, pass the error along 76 | if state.get("error"): 77 | console.print( 78 | f"Skipping LLM Router due to init error: {state['error']}", style="red" 79 | ) 80 | return {"error": state["error"], "route_decision": "__error__"} 81 | 82 | message = state["original_message"] 83 | decision = "__error__" # Default to error 84 | routing_error = None 85 | 86 | try: 87 | console.print( 88 | f"Calling BAML RouteRequest for: '{message[:50]}...'", style="cyan" 89 | ) 90 | # Call the BAML function (synchronously, as it's not declared async in BAML) 
91 | route_result: ExtractorTool = b.RouteRequest(original_message=message) 92 | 93 | console.print(f"LLM Router returned: {route_result}", style="green") 94 | 95 | # Map the enum result to string for routing 96 | if route_result == ExtractorTool.WebpageExtractor: 97 | decision = "web_extractor" 98 | elif route_result == ExtractorTool.PDFExtractor: 99 | decision = "pdf_extractor" 100 | elif route_result == ExtractorTool.TwitterExtractor: 101 | decision = "twitter_extractor" 102 | elif route_result == ExtractorTool.LinkedInExtractor: 103 | decision = "linkedin_extractor" 104 | elif route_result == ExtractorTool.YoutubeExtractor: 105 | decision = "youtube_extractor" # Added Youtube route 106 | elif route_result == ExtractorTool.Unsupported: 107 | decision = "__unsupported__" 108 | routing_error = "Unsupported URL type or no URL found by LLM Router." 109 | console.print(routing_error, style="yellow") 110 | else: 111 | # Should not happen if enum is handled correctly 112 | decision = "__error__" 113 | routing_error = f"LLM Router returned an unexpected value: {route_result}" 114 | console.print(routing_error, style="red") 115 | 116 | except Exception as e: 117 | console.print(f"Error calling BAML RouteRequest: {e}", style="red bold") 118 | routing_error = f"LLM Router failed: {e}" 119 | decision = "__error__" 120 | 121 | # Update the state dictionary 122 | return { 123 | "route_decision": decision, 124 | "error": routing_error, # Overwrite previous error state if routing fails 125 | } 126 | 127 | 128 | def get_web_content(state: AgentState) -> Dict[str, Any]: 129 | """Fetches content from a standard webpage URL using Tavily extract.""" 130 | console.print("---GET WEB CONTENT (Tavily Extract)--- ", style="yellow bold") 131 | url = state["url"] 132 | error_message = None 133 | content_source = "" 134 | content_type = ContentType.Webpage 135 | 136 | # Reset error from previous steps if any 137 | state["error"] = None 138 | # Reset fallback flag if we reached here directly 
or as fallback 139 | state["needs_web_fallback"] = False 140 | 141 | try: 142 | # Use Tavily extract for non-Twitter URLs 143 | console.print(f"Using Tavily extract for: {url}", style="cyan") 144 | extract_tool_results = run_tavily_tool(mode="extract", urls=[url]) 145 | results_list = extract_tool_results.get("results", []) 146 | failed_results = extract_tool_results.get("failed_results", []) 147 | 148 | if results_list: 149 | for res in results_list: 150 | # Try to get 'raw_content' first, fallback to 'content' 151 | raw_content = res.get("raw_content") 152 | if not raw_content: 153 | raw_content = res.get( 154 | "content", "" 155 | ) # Fallback if raw_content is missing 156 | 157 | if raw_content: # Only add if content exists 158 | content_source += f"URL: {res.get('url', 'N/A')}\n" 159 | content_source += f"Raw Content: {raw_content}\n\n" 160 | # Optional: Include images if needed later 161 | # content_source += f"Images: {res.get('images', [])}\n" 162 | 163 | if failed_results: 164 | error_message = ( 165 | f"Tavily failed to extract content from: {', '.join(failed_results)}" 166 | ) 167 | console.print(error_message, style="red") 168 | # If extraction failed entirely and we have no content, set content_source empty 169 | if not content_source: 170 | content_source = "" 171 | 172 | # If after trying extract, we still have no content and no specific error, set a generic one 173 | if not content_source and not error_message: 174 | error_message = "Tavily extract did not return any content for the URL." 175 | console.print(error_message, style="red") 176 | 177 | except Exception as e: 178 | console.print(f"Error getting content from URL {url}: {e}", style="red bold") 179 | error_message = f"Error: An unexpected error occurred while getting content from the URL. 
{e}" 180 | content_source = "" # Ensure content is empty on error 181 | 182 | return { 183 | # **state, # Don't spread the entire state, just update relevant fields 184 | "content_type": content_type, 185 | "content": content_source.strip(), # Strip leading/trailing whitespace 186 | "error": error_message, 187 | "needs_web_fallback": False, # Explicitly set to false after web extraction 188 | } 189 | 190 | 191 | def get_twitter_content(state: AgentState) -> Dict[str, Any]: 192 | """Fetches content from a Twitter/X URL using twitter_api_tool.""" 193 | console.print("---GET TWITTER/X CONTENT (twitterapi.io)--- ", style="yellow bold") 194 | url = state["url"] 195 | error_message = None 196 | content_result = "" 197 | content_type = ContentType.Webpage 198 | 199 | # Reset error from previous steps if any 200 | state["error"] = None 201 | state["needs_web_fallback"] = False # Reset flag 202 | 203 | try: 204 | console.print(f"Fetching tweet thread for URL: {url}", style="cyan") 205 | # Use the new tool 206 | content_result = fetch_tweet_thread(url) 207 | 208 | # Check if the tool returned an error message 209 | if isinstance(content_result, str) and content_result.startswith("Error:"): 210 | error_message = content_result 211 | console.print(error_message, style="red bold") 212 | content_result = "" # Ensure content is empty if tool errored 213 | elif not content_result: # Handle empty success case 214 | error_message = "Twitter tool returned no content." 
215 | console.print(error_message, style="yellow") 216 | content_result = "" 217 | else: 218 | console.print( 219 | f"Successfully fetched Twitter content for: {url}", style="green" 220 | ) 221 | # Ensure content_result is a string 222 | if not isinstance(content_result, str): 223 | content_result = str(content_result) 224 | 225 | except Exception as e: 226 | console.print( 227 | f"Unexpected error calling fetch_tweet_thread for {url}: {e}", 228 | style="red bold", 229 | ) 230 | error_message = ( 231 | f"Error: An unexpected error occurred while calling the Twitter tool. {e}" 232 | ) 233 | content_result = "" 234 | 235 | return { 236 | # **state, 237 | "content_type": content_type, 238 | "content": content_result.strip(), 239 | "error": error_message, 240 | "needs_web_fallback": False, 241 | } 242 | 243 | 244 | def get_linkedin_content(state: AgentState) -> Dict[str, Any]: 245 | """Fetches content from a LinkedIn post URL using linkedin_scraper_tool.""" 246 | console.print( 247 | "---GET LINKEDIN CONTENT (linkedin_scraper_tool)--- ", style="yellow bold" 248 | ) 249 | url = state["url"] 250 | error_message = None 251 | content_result = "" 252 | content_type = ( 253 | ContentType.Webpage 254 | ) # LinkedIn posts are treated as webpages for summarization 255 | 256 | # Reset error from previous steps if any 257 | state["error"] = None 258 | state["needs_web_fallback"] = False # Reset flag 259 | 260 | try: 261 | console.print(f"Fetching LinkedIn post content for URL: {url}", style="cyan") 262 | # Use the LinkedIn tool (AgentQL version) 263 | result = scrape_linkedin_post_agentql( 264 | url, headless=True 265 | ) # Call with headless=True 266 | 267 | # AgentQL scraper returns a dict: {"author": "...", "content": "..."} 268 | if isinstance(result, dict) and result.get("content"): 269 | content_result = result["content"] 270 | # author = result.get("author") # Author is available if needed later 271 | console.print( 272 | f"Successfully fetched LinkedIn content (AgentQL) 
for: {url}", 273 | style="green", 274 | ) 275 | if not isinstance(content_result, str): 276 | content_result = str(content_result) 277 | elif ( 278 | isinstance(result, dict) and "error" in result 279 | ): # Check for an error key if scraper returns errors that way 280 | error_message = ( 281 | f"LinkedIn AgentQL scraper returned an error: {result['error']}" 282 | ) 283 | console.print(error_message, style="red bold") 284 | content_result = "" 285 | else: 286 | error_message = f"LinkedIn AgentQL scraper returned unexpected result or no content: {result}" 287 | console.print(error_message, style="yellow") 288 | content_result = "" 289 | 290 | except Exception as e: 291 | console.print( 292 | f"Unexpected error calling scrape_linkedin_post for {url}: {e}", 293 | style="red bold", 294 | ) 295 | error_message = ( 296 | f"Error: An unexpected error occurred while calling the LinkedIn tool. {e}" 297 | ) 298 | content_result = "" 299 | 300 | return { 301 | # **state, 302 | "content_type": content_type, 303 | "content": content_result.strip(), 304 | "error": error_message, 305 | "needs_web_fallback": False, 306 | } 307 | 308 | 309 | def get_youtube_content(state: AgentState) -> Dict[str, Any]: 310 | """Fetches content (description/transcript) using youtube_scraper with fallbacks.""" 311 | console.print( 312 | "---GET YOUTUBE CONTENT (yt-dlp + Fallbacks)--- ", style="yellow bold" 313 | ) 314 | url = state["url"] 315 | error_message = None 316 | content_result = "" 317 | # For YouTube, let's treat the content type as Webpage for the summarizer initially 318 | content_type = ContentType.Webpage 319 | 320 | # Reset error and fallback flag from previous steps if any 321 | state["error"] = None 322 | # No longer using needs_web_fallback with AgentQL direct approach 323 | # state["needs_web_fallback"] = False 324 | 325 | try: 326 | console.print(f"Fetching YouTube info for URL (AgentQL): {url}", style="cyan") 327 | # Use the YouTube AgentQL tool 328 | result = 
scrape_youtube_agentql(url, headless=True) # Call with headless=True 329 | 330 | # AgentQL scraper returns: {"title": "...", "description": "..."} 331 | if isinstance(result, dict) and ( 332 | result.get("title") or result.get("description") 333 | ): 334 | title = result.get("title", "") 335 | description = result.get("description", "") 336 | content_result = f"Title: {title}\n\nDescription:\n{description}".strip() 337 | console.print( 338 | f"Successfully fetched YouTube content (AgentQL) for: {url}", 339 | style="green", 340 | ) 341 | error_message = None 342 | elif ( 343 | isinstance(result, dict) and "error" in result 344 | ): # If scraper returns dict with error 345 | error_message = ( 346 | f"YouTube AgentQL scraper returned an error: {result['error']}" 347 | ) 348 | console.print(error_message, style="red bold") 349 | content_result = "" 350 | else: 351 | error_message = f"YouTube AgentQL scraper returned unexpected result or no content: {result}" 352 | console.print(error_message, style="yellow") 353 | content_result = "" 354 | 355 | except Exception as e: 356 | console.print( 357 | f"Unexpected error calling scrape_youtube_agentql for {url}: {e}", 358 | style="red bold", 359 | ) 360 | error_message = f"Error: An unexpected error occurred while calling the YouTube AgentQL tool. 
{e}" 361 | content_result = "" 362 | # needs_fallback = False # Not used anymore 363 | 364 | return { 365 | # **state, 366 | "content_type": content_type, 367 | "content": content_result.strip(), 368 | "error": error_message, # Will be None if successful 369 | "needs_web_fallback": False, # Explicitly set to false, not used for fallback anymore 370 | } 371 | 372 | 373 | def handle_pdf_content(state: AgentState) -> Dict[str, Any]: 374 | """Downloads and extracts text from a PDF URL.""" 375 | console.print("---HANDLE PDF CONTENT--- ", style="bold yellow") 376 | url = state["url"] 377 | error_message = None 378 | pdf_text = "" 379 | 380 | # Reset error from previous steps if any 381 | state["error"] = None 382 | state["needs_web_fallback"] = False # Reset flag 383 | 384 | try: 385 | extracted_text = get_pdf_text(url) 386 | if isinstance(extracted_text, str) and extracted_text.startswith("Error:"): 387 | console.print( 388 | f"Error getting PDF content: {extracted_text}", style="red bold" 389 | ) 390 | error_message = extracted_text 391 | elif not extracted_text: 392 | error_message = "PDF extraction returned no text." 393 | console.print(error_message, style="yellow") 394 | else: 395 | console.print( 396 | f"Successfully extracted text from PDF: {url}", style="magenta" 397 | ) 398 | pdf_text = extracted_text 399 | # Ensure text is string 400 | if not isinstance(pdf_text, str): 401 | pdf_text = str(pdf_text) 402 | 403 | except Exception as e: 404 | console.print(f"Unexpected error handling PDF {url}: {e}", style="red bold") 405 | error_message = ( 406 | f"Error: An unexpected error occurred while processing the PDF. 
{e}" 407 | ) 408 | 409 | return { 410 | # **state, 411 | "content": pdf_text.strip(), 412 | "content_type": ContentType.PDF, 413 | "error": error_message, 414 | "needs_web_fallback": False, 415 | } 416 | 417 | 418 | async def summarize_content(state: AgentState) -> Dict[str, Any]: 419 | """Summarizes the extracted content using BAML.""" 420 | console.print("---SUMMARIZE CONTENT--- ", style="bold green") 421 | 422 | content_to_summarize = state.get("content") 423 | 424 | # If there was an error *before* summarization, don't proceed 425 | if state.get("error"): 426 | console.print( 427 | f"Skipping summarization due to previous error: {state['error']}", 428 | style="yellow", 429 | ) 430 | return {"summary": "", "error": state["error"]} # Keep existing error 431 | 432 | if not content_to_summarize or content_to_summarize.strip() == "": 433 | console.print("No content available to summarize.", style="yellow") 434 | # If we reached here due to an upstream error, preserve it 435 | # Otherwise, set an error indicating no content. 436 | final_error = state.get("error") or "No content found to summarize." 
437 | return { 438 | "summary": "", 439 | "error": final_error, 440 | } 441 | 442 | url = state.get("url", "Unknown URL") 443 | summarization_error = None 444 | formatted_summary = "" 445 | 446 | try: 447 | console.print( 448 | f"--- Debug: Summarizing {len(content_to_summarize)} chars --- ", 449 | style="dim", 450 | ) 451 | # Ensure content_type is valid, default to Webpage if missing/invalid 452 | content_type = state.get("content_type", ContentType.Webpage) 453 | if not isinstance(content_type, ContentType): 454 | content_type = ContentType.Webpage # Default fallback 455 | 456 | # Call the BAML function (assuming it's synchronous based on definition) 457 | summary_result: Summary = b.SummarizeContent( 458 | content=content_to_summarize, 459 | content_type=content_type, 460 | context=state.get("original_message", ""), 461 | ) 462 | console.print(f"Successfully generated summary.", style="bold green") 463 | title = getattr(summary_result, "title", "Summary") # Default title 464 | key_points = getattr(summary_result, "key_points", []) 465 | concise_summary = getattr( 466 | summary_result, 467 | "concise_summary", 468 | "Summarization service returned an unexpected response format.", 469 | ) 470 | 471 | # Ensure parts are strings 472 | title = str(title) if title else "Summary" 473 | key_points = [str(p).strip() for p in key_points if p] 474 | concise_summary = ( 475 | str(concise_summary).strip() if concise_summary else "No summary generated." 
476 | ) 477 | 478 | formatted_summary = f"# {title}\n\n" 479 | if key_points: 480 | formatted_summary += "## Key Points:\n" 481 | for point in key_points: 482 | formatted_summary += f"- {point}\n" 483 | formatted_summary += "\n" # Add space before summary 484 | formatted_summary += f"## Summary:\n{concise_summary}" 485 | formatted_summary = re.sub(r"\n\s*\n", "\n\n", formatted_summary).strip() 486 | 487 | # Clear any previous error if summarization succeeds 488 | summarization_error = None 489 | 490 | except Exception as e: 491 | console.print(f"Error during summarization for {url}: {e}", style="red bold") 492 | print(f"--- Debug: BAML summarization error: {e} ---") 493 | summarization_error = f"Summarization failed: {e}" 494 | formatted_summary = "" # Ensure summary is empty on error 495 | 496 | # Return only summary and error, let graph manage state merge 497 | return { 498 | "summary": formatted_summary, 499 | "error": summarization_error, # Overwrite previous errors only if summarization fails 500 | } 501 | 502 | 503 | # --- Conditional Edges Logic --- 504 | 505 | 506 | def route_based_on_llm(state: AgentState) -> str: 507 | """Routes to the appropriate extractor based on the LLM router decision.""" 508 | console.print("---ROUTING (LLM Decision)--- ", style="yellow bold") 509 | decision = state.get("route_decision") 510 | error = state.get("error") # Check for errors from init or router node 511 | 512 | if error: 513 | console.print(f"Routing to END due to error: {error}", style="red") 514 | return END 515 | 516 | if decision == "web_extractor": 517 | console.print(f"LLM Routed to: Web Extractor", style="magenta") 518 | return "web_extractor" 519 | elif decision == "pdf_extractor": 520 | console.print(f"LLM Routed to: PDF Extractor", style="magenta") 521 | return "pdf_extractor" 522 | elif decision == "twitter_extractor": 523 | console.print(f"LLM Routed to: Twitter Extractor", style="magenta") 524 | return "twitter_extractor" 525 | elif decision == 
"linkedin_extractor": 526 | console.print(f"LLM Routed to: LinkedIn Extractor", style="magenta") 527 | return "linkedin_extractor" 528 | elif decision == "youtube_extractor": 529 | console.print(f"LLM Routed to: YouTube Extractor", style="magenta") 530 | return "youtube_extractor" # Added Youtube route 531 | elif decision == "__unsupported__": 532 | console.print("LLM Routed to: Unsupported -> END", style="yellow") 533 | # Error message should already be set by the router node 534 | return END 535 | else: # Includes __error__ or unexpected values 536 | console.print( 537 | f"LLM Routing decision invalid or error ('{decision}'). Routing to END.", 538 | style="red", 539 | ) 540 | # Ensure error state reflects this if not already set 541 | current_error = state.get("error") 542 | if not current_error: 543 | # Update state directly is tricky in conditional functions. 544 | # Ideally, the router node should set the error if decision is __error__. 545 | # For now, just log and route to end. 546 | console.print( 547 | f"Setting error state due to invalid routing: {decision}", style="red" 548 | ) 549 | # state["error"] = f"Invalid routing decision: {decision}" 550 | return END 551 | 552 | 553 | def should_summarize(state: AgentState) -> str: 554 | """Determines whether to proceed to summarization or end after extraction.""" 555 | content = state.get("content") 556 | error = state.get("error") # Check error from the *extractor* node 557 | has_content = content and isinstance(content, str) and content.strip() != "" 558 | # needs_fallback is no longer used for YouTube -> Webpage fallback 559 | # if needs_fallback: 560 | # console.print( 561 | # "Routing after Extraction: YouTube fallback failed, routing to Web Extractor.", 562 | # style="yellow", 563 | # ) 564 | # return "web_extractor" # Route to web extractor as the last resort 565 | 566 | if error: 567 | console.print( 568 | f"Routing after Extraction: Error occurred ('{error}'), routing to END.", 569 | style="red", 570 | 
) 571 | return END 572 | elif has_content: 573 | console.print( 574 | "Routing after Extraction: Content extracted successfully, routing to Summarize.", 575 | style="green", 576 | ) 577 | return "summarize_content" 578 | else: 579 | console.print( 580 | "Routing after Extraction: No content extracted and no specific error, routing to END.", 581 | style="yellow", 582 | ) 583 | # Set an error if none exists from the extractor 584 | current_error = state.get("error") 585 | final_error = current_error or "Content extraction finished with no content." 586 | # state["error"] = final_error # Avoid direct state modification here 587 | console.print(f"Setting error state: {final_error}", style="yellow") 588 | # How to set error state correctly before END? 589 | # LangGraph merges the partial state returned by the node *after* the edge logic. 590 | # We might need an explicit error handling node. 591 | # For now, just route to END. The final state check should catch the lack of summary. 592 | return END 593 | 594 | 595 | # --- Build the Graph --- 596 | 597 | 598 | def build_graph(): 599 | workflow = StateGraph(AgentState) 600 | 601 | # Add nodes 602 | workflow.add_node("init", init_state) 603 | workflow.add_node("llm_router", llm_router) # New router node 604 | workflow.add_node("web_extractor", get_web_content) 605 | workflow.add_node("pdf_extractor", handle_pdf_content) 606 | workflow.add_node("twitter_extractor", get_twitter_content) 607 | workflow.add_node("linkedin_extractor", get_linkedin_content) 608 | workflow.add_node("youtube_extractor", get_youtube_content) # Add new node 609 | workflow.add_node("summarize_content", summarize_content) 610 | 611 | # Define edges 612 | workflow.set_entry_point("init") 613 | 614 | # Edge from init to the LLM router 615 | workflow.add_edge("init", "llm_router") 616 | 617 | # Conditional routing based on LLM Router output 618 | workflow.add_conditional_edges( 619 | "llm_router", 620 | route_based_on_llm, 621 | { 622 | "web_extractor": 
"web_extractor", 623 | "pdf_extractor": "pdf_extractor", 624 | "twitter_extractor": "twitter_extractor", 625 | "linkedin_extractor": "linkedin_extractor", 626 | "youtube_extractor": "youtube_extractor", # Add edge to new node 627 | END: END, # Handles errors and unsupported cases from the router 628 | }, 629 | ) 630 | 631 | # Route from each extractor to the summarization check 632 | # Note: The should_summarize function now handles routing to web_extractor for YouTube fallback 633 | workflow.add_conditional_edges( 634 | "web_extractor", 635 | should_summarize, 636 | { 637 | "summarize_content": "summarize_content", 638 | END: END, 639 | # No web_extractor fallback from web_extractor itself 640 | }, 641 | ) 642 | workflow.add_conditional_edges( 643 | "pdf_extractor", 644 | should_summarize, 645 | { 646 | "summarize_content": "summarize_content", 647 | END: END, 648 | # No web_extractor fallback needed from pdf 649 | }, 650 | ) 651 | workflow.add_conditional_edges( 652 | "twitter_extractor", 653 | should_summarize, 654 | { 655 | "summarize_content": "summarize_content", 656 | END: END, 657 | # No web_extractor fallback needed from twitter 658 | }, 659 | ) 660 | workflow.add_conditional_edges( 661 | "linkedin_extractor", 662 | should_summarize, 663 | { 664 | "summarize_content": "summarize_content", 665 | END: END, 666 | # No web_extractor fallback needed from linkedin 667 | }, 668 | ) 669 | workflow.add_conditional_edges( 670 | "youtube_extractor", # Edges from YouTube extractor 671 | should_summarize, # Use the same logic function, now enhanced 672 | { 673 | "summarize_content": "summarize_content", 674 | END: END, 675 | }, 676 | ) 677 | 678 | # Summarizer always goes to end 679 | workflow.add_edge("summarize_content", END) 680 | 681 | return workflow.compile() 682 | 683 | 684 | graph = build_graph() 685 | 686 | # --- Main Agent Function --- 687 | 688 | 689 | async def run_agent(message: str) -> Union[str, None]: 690 | """ 691 | Runs the LangGraph agent workflow 
for URL summarization using an LLM router. 692 | 693 | Args: 694 | message: The original message potentially containing a URL. 695 | 696 | Returns: 697 | - str: Summary text on successful extraction and summarization. 698 | - str: An error message string if a significant error occurred. 699 | - None: Should ideally not be returned if error handling is robust. 700 | """ 701 | inputs = {"original_message": message} 702 | final_state = None 703 | try: 704 | # Use graph.astream for async execution 705 | async for output in graph.astream( 706 | inputs, {"recursion_limit": 15} 707 | ): # Increased recursion limit 708 | # output is a dictionary where keys are node names and values are states after the node ran 709 | # We are interested in the state *after* the last node executes 710 | node_name = list(output.keys())[0] 711 | final_state = output[node_name] # Keep track of the latest state 712 | console.print(f"Output from node '{node_name}': Updated state", style="dim") 713 | # Optional: Print intermediate state details if needed for debugging 714 | # console.print(f" State keys: {list(final_state.keys())}", style="dim") 715 | 716 | if final_state: 717 | # Debug: Print the final state (simplified) 718 | console.print("---FINAL STATE--- ", style="bold magenta") 719 | # Sort keys for consistent output order 720 | state_keys = sorted(final_state.keys()) 721 | for key in state_keys: 722 | value = final_state[key] 723 | if key == "content" and isinstance(value, str) and len(value) > 200: 724 | console.print( 725 | f" {key}: ({len(value)} chars)", style="magenta" 726 | ) 727 | elif isinstance(value, str) and len(value) > 100: 728 | console.print( 729 | f" {key}: ({len(value)} chars)", style="magenta" 730 | ) 731 | else: 732 | console.print(f" {key}: {value}", style="magenta") 733 | 734 | # Determine final result based on summary and error fields 735 | summary_text = final_state.get("summary") 736 | final_error = final_state.get("error") 737 | 738 | # 1. 
Successful Summary (even if there were intermediate, recoverable errors) 739 | if summary_text and isinstance(summary_text, str) and summary_text.strip(): 740 | console.print("---AGENT FINISHED: Summary--- ", style="bold green") 741 | # If an error occurred *before* summarization, but summarization *still* happened 742 | # (e.g. fallback content used), we might want to mention the error. 743 | # For now, prioritize showing the summary if available. 744 | # if final_error: 745 | # console.print(f"(Note: An earlier error occurred: {final_error})", style="yellow") 746 | return summary_text 747 | 748 | # 2. Error occurred (could be init, routing, extraction, or summarization error) 749 | elif final_error: 750 | console.print( 751 | f"---AGENT FINISHED: Error ('{final_error}')--- ", style="bold red" 752 | ) 753 | # Ensure the error message is prefixed consistently 754 | if isinstance(final_error, str) and final_error.lower().startswith( 755 | "error:" 756 | ): 757 | return final_error 758 | else: 759 | return "Error: " + str(final_error) # Ensure it's a string 760 | 761 | # 3. No Summary and No Error (Should ideally not happen with should_summarize logic, 762 | # but could occur if summarizer returns empty without error) 763 | else: 764 | console.print( 765 | "---AGENT FINISHED: No Summary/No Error--- ", style="bold yellow" 766 | ) 767 | # Provide a more specific fallback message 768 | if not final_state.get("content"): 769 | # Check if it was an unsupported URL type initially 770 | if final_state.get("route_decision") == "__unsupported__": 771 | return "Error: The provided link type is not supported or no URL was found." 772 | else: 773 | return "Error: Agent finished without extracting content." 774 | else: 775 | return "Error: Agent finished. Content was extracted, but no summary was generated and no specific error was reported." 
776 | 777 | else: 778 | console.print("---AGENT FAILED: No Final State--- ", style="bold red") 779 | return "Error: Agent workflow did not produce a final state." 780 | 781 | except Exception as e: 782 | console.print("---AGENT FAILED: Runtime Exception--- ", style="bold red") 783 | console.print_exception(show_locals=False) 784 | # Ensure the exception is converted to a string for the return value 785 | return "Error: An unexpected error occurred in the agent: " + str(e) 786 | 787 | 788 | # Example usage (for testing) 789 | if __name__ == "__main__": 790 | import asyncio 791 | 792 | # --- Test Cases --- 793 | # Twitter/X URL 794 | test_url_msg_twitter = ( 795 | "Summarize this tweet: https://x.com/natolambert/status/1917928418068541520" 796 | ) 797 | # Standard Web URL 798 | test_url_msg_web = ( 799 | "Can you summarize this? https://lilianweng.github.io/posts/2023-06-23-agent/" 800 | ) 801 | # PDF URL 802 | test_url_msg_pdf = "Summarize: https://arxiv.org/pdf/2305.15334.pdf" 803 | # URL that might fail primary extraction (Tavily might fail, but router should still pick web) 804 | test_url_msg_fail = ( 805 | "What about this? https://httpbin.org/delay/5" # Example, Tavily might timeout 806 | ) 807 | # LinkedIn URL 808 | test_url_msg_linkedin = "Summarize this post: https://www.linkedin.com/posts/omarsar_llms-for-engineering-activity-7324064951734603776-Ravc?utm_source=share&utm_medium=member_desktop&rcm=ACoAABDFOm0BmXlu4cLYtJePo0mLzdFoB5itUNU" 809 | # Message without a URL (Router should pick Unsupported) 810 | test_url_msg_nourl = "Hello, how are you?" 
811 | # Unsupported URL Type (Router should pick Unsupported) 812 | test_url_msg_unsupported = "Check this out: ftp://files.example.com/data.zip" 813 | # YouTube URL (Router should pick Youtube) 814 | test_url_msg_youtube = "Summarize this video: https://www.youtube.com/watch?v=n5oBmmBkW6A" # URL from youtube_scraper test 815 | # YouTube URL that requires login (Should fallback to Tavily) 816 | test_url_msg_youtube_login = ( 817 | "Summarize: https://www.youtube.com/watch?v=hhMXE9-JUAc" # Test fallback 818 | ) 819 | 820 | async def main(): 821 | test_cases = { 822 | # "Twitter": test_url_msg_twitter, 823 | # "Web": test_url_msg_web, 824 | # "PDF": test_url_msg_pdf, 825 | # "Web Fail": test_url_msg_fail, # May take time 826 | # "LinkedIn": test_url_msg_linkedin, 827 | # "No URL": test_url_msg_nourl, 828 | # "Unsupported FTP": test_url_msg_unsupported, 829 | "YouTube": test_url_msg_youtube, 830 | # "YouTube Needs Login": test_url_msg_youtube_login, # Test fallback (AgentQL should handle public ones) 831 | } 832 | 833 | for name, msg in test_cases.items(): 834 | print(f"\n{'=/' * 10} RUNNING TEST: {name} {'=/' * 10}") 835 | print(f"Input message: {msg}") 836 | result = await run_agent(msg) 837 | print("\n--- FINAL RESULT --- ") 838 | if result: 839 | # Ensure result is treated as a string before printing 840 | print(str(result)) 841 | else: 842 | # Handle the case where run_agent might return None (though it aims not to) 843 | print("Agent returned None or an empty result.") 844 | print(f"{'=/' * 10} FINISHED TEST: {name} {'=/' * 10}\n") 845 | 846 | asyncio.run(main()) 847 | -------------------------------------------------------------------------------- /agent_viz.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import marimo 4 | 5 | __generated_with = "0.13.2" 6 | app = marimo.App(width="medium") 7 | 8 | 9 | @app.cell 10 | def _(): 11 | import marimo as mo 12 | import nest_asyncio 13 | 14 | nest_asyncio.apply() # Allow 
nested asyncio loops 15 | 16 | # --- The rest of your imports --- 17 | from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles 18 | from agent import build_graph # Your import 19 | # --------------------------------- 20 | 21 | app = build_graph() 22 | 23 | # Now this should work without the RuntimeError 24 | png_bytes = app.get_graph().draw_mermaid_png( 25 | draw_method=MermaidDrawMethod.PYPPETEER, 26 | ) 27 | 28 | # Display the image using marimo 29 | mo.image(src=png_bytes, alt="LangGraph Flow", caption="Telegram Summarizer Bot Graph") 30 | return 31 | 32 | 33 | @app.cell 34 | def _(): 35 | return 36 | 37 | 38 | if __name__ == "__main__": 39 | app.run() 40 | -------------------------------------------------------------------------------- /baml_client/__init__.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | __version__ = "0.88.0" 17 | 18 | try: 19 | from baml_py.safe_import import EnsureBamlPyImport 20 | except ImportError: 21 | raise ImportError(f"""Update to baml-py required. 22 | Version of baml_client generator (see generators.baml): {__version__} 23 | 24 | Please upgrade baml-py to version "{__version__}". 
25 | 26 | $ pip install baml-py=={__version__} 27 | $ uv add baml-py=={__version__} 28 | 29 | If nothing else works, please ask for help: 30 | 31 | https://github.com/boundaryml/baml/issues 32 | https://boundaryml.com/discord 33 | """) from None 34 | 35 | with EnsureBamlPyImport(__version__) as e: 36 | e.raise_if_incompatible_version(__version__) 37 | 38 | from . import types 39 | from . import tracing 40 | from . import partial_types 41 | from . import config 42 | from .config import reset_baml_env_vars 43 | 44 | from .sync_client import b 45 | 46 | 47 | __all__ = [ 48 | "b", 49 | "partial_types", 50 | "tracing", 51 | "types", 52 | "reset_baml_env_vars", 53 | "config", 54 | ] -------------------------------------------------------------------------------- /baml_client/async_client.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, TypeVar, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | import pprint 19 | 20 | import baml_py 21 | from pydantic import BaseModel, ValidationError, create_model 22 | 23 | from . 
import partial_types, types 24 | from .types import Checked, Check 25 | from .type_builder import TypeBuilder 26 | from .parser import LlmResponseParser, LlmStreamParser 27 | from .async_request import AsyncHttpRequest, AsyncHttpStreamRequest 28 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 29 | 30 | OutputType = TypeVar('OutputType') 31 | 32 | 33 | # Define the TypedDict with optional parameters having default values 34 | class BamlCallOptions(TypedDict, total=False): 35 | tb: NotRequired[TypeBuilder] 36 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 37 | collector: NotRequired[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] 38 | 39 | 40 | class BamlAsyncClient: 41 | __runtime: baml_py.BamlRuntime 42 | __ctx_manager: baml_py.BamlCtxManager 43 | __stream_client: "BamlStreamClient" 44 | __http_request: "AsyncHttpRequest" 45 | __http_stream_request: "AsyncHttpStreamRequest" 46 | __llm_response_parser: LlmResponseParser 47 | __llm_stream_parser: LlmStreamParser 48 | __baml_options: BamlCallOptions 49 | 50 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 51 | self.__runtime = runtime 52 | self.__ctx_manager = ctx_manager 53 | self.__stream_client = BamlStreamClient(self.__runtime, self.__ctx_manager, baml_options) 54 | self.__http_request = AsyncHttpRequest(self.__runtime, self.__ctx_manager) 55 | self.__http_stream_request = AsyncHttpStreamRequest(self.__runtime, self.__ctx_manager) 56 | self.__llm_response_parser = LlmResponseParser(self.__runtime, self.__ctx_manager) 57 | self.__llm_stream_parser = LlmStreamParser(self.__runtime, self.__ctx_manager) 58 | self.__baml_options = baml_options or {} 59 | 60 | def with_options( 61 | self, 62 | tb: Optional[TypeBuilder] = None, 63 | client_registry: Optional[baml_py.baml_py.ClientRegistry] = None, 64 | 
collector: Optional[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] = None, 65 | ) -> "BamlAsyncClient": 66 | """ 67 | Returns a new instance of BamlAsyncClient with explicitly typed baml options 68 | for Python 3.8 compatibility. 69 | """ 70 | new_options = self.__baml_options.copy() 71 | 72 | # Override if any keyword arguments were provided. 73 | if tb is not None: 74 | new_options["tb"] = tb 75 | if client_registry is not None: 76 | new_options["client_registry"] = client_registry 77 | if collector is not None: 78 | new_options["collector"] = collector 79 | 80 | return BamlAsyncClient(self.__runtime, self.__ctx_manager, new_options) 81 | 82 | @property 83 | def stream(self): 84 | return self.__stream_client 85 | 86 | @property 87 | def request(self): 88 | return self.__http_request 89 | 90 | @property 91 | def stream_request(self): 92 | return self.__http_stream_request 93 | 94 | @property 95 | def parse(self): 96 | return self.__llm_response_parser 97 | 98 | @property 99 | def parse_stream(self): 100 | return self.__llm_stream_parser 101 | 102 | 103 | async def RouteRequest( 104 | self, 105 | original_message: str, 106 | baml_options: BamlCallOptions = {}, 107 | ) -> types.ExtractorTool: 108 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 109 | 110 | __tb__ = options.get("tb", None) 111 | if __tb__ is not None: 112 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 113 | else: 114 | tb = None 115 | __cr__ = options.get("client_registry", None) 116 | collector = options.get("collector", None) 117 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 118 | raw = await self.__runtime.call_function( 119 | "RouteRequest", 120 | { 121 | "original_message": original_message, 122 | }, 123 | self.__ctx_manager.get(), 124 | tb, 125 | __cr__, 126 | collectors, 127 | ) 128 | return cast(types.ExtractorTool, raw.cast_to(types, types, partial_types, 
False)) 129 | 130 | async def SummarizeContent( 131 | self, 132 | content: str,content_type: types.ContentType,context: Optional[str], 133 | baml_options: BamlCallOptions = {}, 134 | ) -> types.Summary: 135 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 136 | 137 | __tb__ = options.get("tb", None) 138 | if __tb__ is not None: 139 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 140 | else: 141 | tb = None 142 | __cr__ = options.get("client_registry", None) 143 | collector = options.get("collector", None) 144 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 145 | raw = await self.__runtime.call_function( 146 | "SummarizeContent", 147 | { 148 | "content": content,"content_type": content_type,"context": context, 149 | }, 150 | self.__ctx_manager.get(), 151 | tb, 152 | __cr__, 153 | collectors, 154 | ) 155 | return cast(types.Summary, raw.cast_to(types, types, partial_types, False)) 156 | 157 | 158 | 159 | class BamlStreamClient: 160 | __runtime: baml_py.BamlRuntime 161 | __ctx_manager: baml_py.BamlCtxManager 162 | __baml_options: BamlCallOptions 163 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 164 | self.__runtime = runtime 165 | self.__ctx_manager = ctx_manager 166 | self.__baml_options = baml_options or {} 167 | 168 | 169 | def RouteRequest( 170 | self, 171 | original_message: str, 172 | baml_options: BamlCallOptions = {}, 173 | ) -> baml_py.BamlStream[Optional[types.ExtractorTool], types.ExtractorTool]: 174 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 175 | __tb__ = options.get("tb", None) 176 | if __tb__ is not None: 177 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 178 | else: 179 | tb = None 180 | __cr__ = options.get("client_registry", None) 181 | collector = options.get("collector", None) 182 | 
collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 183 | raw = self.__runtime.stream_function( 184 | "RouteRequest", 185 | { 186 | "original_message": original_message, 187 | }, 188 | None, 189 | self.__ctx_manager.get(), 190 | tb, 191 | __cr__, 192 | collectors, 193 | ) 194 | 195 | return baml_py.BamlStream[Optional[types.ExtractorTool], types.ExtractorTool]( 196 | raw, 197 | lambda x: cast(Optional[types.ExtractorTool], x.cast_to(types, types, partial_types, True)), 198 | lambda x: cast(types.ExtractorTool, x.cast_to(types, types, partial_types, False)), 199 | self.__ctx_manager.get(), 200 | ) 201 | 202 | def SummarizeContent( 203 | self, 204 | content: str,content_type: types.ContentType,context: Optional[str], 205 | baml_options: BamlCallOptions = {}, 206 | ) -> baml_py.BamlStream[partial_types.Summary, types.Summary]: 207 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 208 | __tb__ = options.get("tb", None) 209 | if __tb__ is not None: 210 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 211 | else: 212 | tb = None 213 | __cr__ = options.get("client_registry", None) 214 | collector = options.get("collector", None) 215 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 216 | raw = self.__runtime.stream_function( 217 | "SummarizeContent", 218 | { 219 | "content": content, 220 | "content_type": content_type, 221 | "context": context, 222 | }, 223 | None, 224 | self.__ctx_manager.get(), 225 | tb, 226 | __cr__, 227 | collectors, 228 | ) 229 | 230 | return baml_py.BamlStream[partial_types.Summary, types.Summary]( 231 | raw, 232 | lambda x: cast(partial_types.Summary, x.cast_to(types, types, partial_types, True)), 233 | lambda x: cast(types.Summary, x.cast_to(types, types, partial_types, False)), 234 | self.__ctx_manager.get(), 235 | ) 236 | 237 | 238 | 239 | b = 
BamlAsyncClient(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX) 240 | 241 | __all__ = ["b"] -------------------------------------------------------------------------------- /baml_client/async_request.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . 
import types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class AsyncHttpRequest: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | async def RouteRequest( 41 | self, 42 | original_message: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> baml_py.HTTPRequest: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | return await self.__runtime.build_request( 53 | "RouteRequest", 54 | { 55 | "original_message": original_message, 56 | }, 57 | self.__ctx_manager.get(), 58 | tb, 59 | __cr__, 60 | False, 61 | ) 62 | 63 | async def SummarizeContent( 64 | self, 65 | content: str,content_type: types.ContentType,context: Optional[str], 66 | baml_options: BamlCallOptions = {}, 67 | ) -> baml_py.HTTPRequest: 68 | __tb__ = baml_options.get("tb", None) 69 | if __tb__ is not None: 70 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 71 | else: 72 | tb = None 73 | __cr__ = baml_options.get("client_registry", None) 74 | 75 | return await self.__runtime.build_request( 76 | "SummarizeContent", 77 | { 78 | "content": content, 79 | "content_type": content_type, 80 | "context": context, 81 | }, 82 | self.__ctx_manager.get(), 83 | tb, 84 | __cr__, 85 | False, 86 | ) 87 | 88 | 89 | 90 | class AsyncHttpStreamRequest: 91 | __runtime: baml_py.BamlRuntime 92 | __ctx_manager: baml_py.BamlCtxManager 93 | 94 | def __init__(self, runtime: baml_py.BamlRuntime, 
ctx_manager: baml_py.BamlCtxManager): 95 | self.__runtime = runtime 96 | self.__ctx_manager = ctx_manager 97 | 98 | 99 | async def RouteRequest( 100 | self, 101 | original_message: str, 102 | baml_options: BamlCallOptions = {}, 103 | ) -> baml_py.HTTPRequest: 104 | __tb__ = baml_options.get("tb", None) 105 | if __tb__ is not None: 106 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 107 | else: 108 | tb = None 109 | __cr__ = baml_options.get("client_registry", None) 110 | 111 | return await self.__runtime.build_request( 112 | "RouteRequest", 113 | { 114 | "original_message": original_message, 115 | }, 116 | self.__ctx_manager.get(), 117 | tb, 118 | __cr__, 119 | True, 120 | ) 121 | 122 | async def SummarizeContent( 123 | self, 124 | content: str,content_type: types.ContentType,context: Optional[str], 125 | baml_options: BamlCallOptions = {}, 126 | ) -> baml_py.HTTPRequest: 127 | __tb__ = baml_options.get("tb", None) 128 | if __tb__ is not None: 129 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 130 | else: 131 | tb = None 132 | __cr__ = baml_options.get("client_registry", None) 133 | 134 | return await self.__runtime.build_request( 135 | "SummarizeContent", 136 | { 137 | "content": content, 138 | "content_type": content_type, 139 | "context": context, 140 | }, 141 | self.__ctx_manager.get(), 142 | tb, 143 | __cr__, 144 | True, 145 | ) 146 | 147 | 148 | 149 | __all__ = ["AsyncHttpRequest", "AsyncHttpStreamRequest"] -------------------------------------------------------------------------------- /baml_client/config.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! 
To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from baml_py.logging import set_log_level, get_log_level, set_log_json_mode, set_log_max_chunk_length 17 | from .globals import reset_baml_env_vars 18 | 19 | __all__ = ["set_log_level", "get_log_level", "set_log_json_mode", "reset_baml_env_vars", "set_log_max_chunk_length"] -------------------------------------------------------------------------------- /baml_client/globals.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from __future__ import annotations 17 | import os 18 | 19 | from baml_py import BamlCtxManager, BamlRuntime 20 | from baml_py.baml_py import BamlError 21 | from .inlinedbaml import get_baml_files 22 | from typing_extensions import Literal 23 | from typing import Dict, Any 24 | 25 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME = BamlRuntime.from_files( 26 | "baml_src", 27 | get_baml_files(), 28 | os.environ.copy() 29 | ) 30 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX = BamlCtxManager(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME) 31 | 32 | def reset_baml_env_vars(env_vars: Dict[str, str]): 33 | if DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.allow_reset(): 34 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME.reset( 35 | "baml_src", 36 | get_baml_files(), 37 | env_vars 38 | ) 39 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.reset() 40 | else: 41 | raise BamlError("Cannot reset BAML environment variables while there are active BAML contexts.") 42 | 43 | try: 44 | import dotenv 45 | from unittest.mock import patch 46 | 47 | # Monkeypatch load_dotenv to call reset_baml_env_vars after execution 48 | original_load_dotenv = dotenv.load_dotenv 49 | 50 | def patched_load_dotenv(*args: Any, **kwargs: Any) -> Any: 51 | result = original_load_dotenv(*args, **kwargs) 52 | try: 53 | reset_baml_env_vars(os.environ.copy()) 54 | except BamlError: 55 | # swallow the error 56 | pass 57 | return result 58 | 59 | patch('dotenv.load_dotenv', patched_load_dotenv).start() 60 | except ImportError: 61 | # dotenv is not installed, so we do nothing 62 | pass 63 | 64 | __all__ = [] -------------------------------------------------------------------------------- /baml_client/inlinedbaml.py: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | 17 | file_map = { 18 | 19 | "clients.baml": "client Gemini2_5_flash {\n provider google-ai\n options {\n model gemini-2.5-flash-preview-04-17\n api_key env.GEMINI_API_KEY\n }\n}\n\nclient Gemini2_5_pro {\n provider google-ai\n options {\n model gemini-2.5-pro-exp-03-25\n api_key env.GEMINI_API_KEY\n }\n}\n\nclient DeepSeekR1 {\n provider \"openai\"\n options {\n api_key env.DEEPSEEK_API_KEY\n base_url \"https://api.deepseek.com\"\n model \"deepseek-reasoner\"\n }\n}\n\nclient DeepSeekV3 {\n provider \"openai\"\n options {\n api_key env.DEEPSEEK_API_KEY\n base_url \"https://api.deepseek.com\"\n model \"deepseek-chat\"\n temperature 0.1\n }\n}\n\n// https://docs.boundaryml.com/docs/snippets/clients/fallback\nclient LLMFallback {\n provider fallback\n options {\n // This will try the clients in order until one succeeds\n strategy [DeepSeekV3, Gemini2_5_flash]\n }\n}\n\n// https://docs.boundaryml.com/docs/snippets/clients/retry\nretry_policy Constant {\n max_retries 3\n // Strategy is optional\n strategy {\n type constant_delay\n delay_ms 200\n }\n}\n\nretry_policy Exponential {\n max_retries 2\n // Strategy is optional\n strategy {\n type exponential_backoff\n delay_ms 300\n multiplier 1.5\n max_delay_ms 10000\n }\n}", 20 | "generators.baml": "// This helps use auto generate libraries you can use in the language of\n// your choice. 
You can have multiple generators if you use multiple languages.\n// Just ensure that the output_dir is different for each generator.\ngenerator target {\n // Valid values: \"python/pydantic\", \"typescript\", \"ruby/sorbet\", \"rest/openapi\"\n output_type \"python/pydantic\"\n\n // Where the generated code will be saved (relative to baml_src/)\n output_dir \"../\"\n\n // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml).\n // The BAML VSCode extension version should also match this version.\n version \"0.88.0\"\n\n // Valid values: \"sync\", \"async\"\n // This controls what `b.FunctionName()` will be (sync or async).\n default_client_mode sync\n}\n", 21 | "router.baml": "// Define the possible extraction tools\nenum ExtractorTool {\n WebpageExtractor // For general webpages\n PDFExtractor // For PDF documents\n TwitterExtractor // For Twitter/X URLs\n LinkedInExtractor // For LinkedIn post URLs\n Unsupported // For URLs or content types we cannot handle\n YoutubeExtractor // For YouTube video URLs\n}\n\n// Define the router function\n// It takes the original message and decides which tool to use.\nfunction RouteRequest(original_message: string) -> ExtractorTool {\n // Use a capable but fast client for routing\n// client Gemini2_5_flash\n client LLMFallback\n\n prompt #\"\n Analyze the following user message and determine the best tool to use for extracting content from any URL present.\n\n User Message:\n ---\n {{ original_message }}\n ---\n\n Identify the primary URL in the message. 
Based *only* on the URL's structure or file extension, choose one of the following tools:\n\n - If the URL points to a PDF file (ends with .pdf), choose PDFExtractor.\n - If the URL is from Twitter or X (contains twitter.com or x.com), choose TwitterExtractor.\n - If the URL is a LinkedIn post (contains linkedin.com/posts/), choose LinkedInExtractor.\n - If the URL is a YouTube video (contains youtube.com/watch or youtu.be/), choose YoutubeExtractor.\n - For all other standard web URLs (http or https), choose WebpageExtractor.\n - If no URL is found, or the URL type is clearly unsupported (e.g., ftp://, mailto:), choose Unsupported.\n\n Output *only* the name of the chosen tool from the 'ExtractorTool' enum.\n\n {{ ctx.output_format }}\n \"#\n}\n\n// Optional test case\ntest RouteWebpage {\n functions [RouteRequest]\n args {\n original_message #\"Summarize this blog post: https://example.com/blog/article-123\"#\n }\n}\n\ntest RoutePDF {\n functions [RouteRequest]\n args {\n original_message #\"Can you process this PDF? 
https://arxiv.org/pdf/2401.0001.pdf\"#\n }\n}\n\ntest RouteTwitter {\n functions [RouteRequest]\n args {\n original_message #\"Look at this thread: https://x.com/user/status/12345\"#\n }\n}\n\ntest RouteLinkedIn {\n functions [RouteRequest]\n args {\n original_message #\"Interesting thoughts here: https://www.linkedin.com/posts/someuser_activity-1234567890-abcd?utm_source=share\"#\n }\n}\n\ntest RouteNoURL {\n functions [RouteRequest]\n args {\n original_message #\"Tell me a joke.\"#\n }\n}\n\ntest RouteUnsupported {\n functions [RouteRequest]\n args {\n original_message #\"Check this out: ftp://files.example.com/data.zip\"#\n }\n}\n\ntest RouteYouTube {\n functions [RouteRequest]\n args {\n original_message #\"Summarize this video: https://www.youtube.com/watch?v=dQw4w9WgXcQ\"#\n }\n}\n\n", 22 | "summarize.baml": "\n// Define an enum for the type of content being summarized\nenum ContentType {\n Webpage\n PDF\n GenericText\n}\n\n// Define the structure for the summarization output\nclass Summary {\n title string @description(\"A concise and informative title for the summarized content(max 10 words).\")\n key_points string[] @description(\"A list of the most important points or takeaways from the content. (3-5 points)\")\n concise_summary string @description(\"A brief paragraph summarizing the entire content. (50-100 words)\")\n}\n\n// Define the main summarization function\n// This function handles shorter texts directly or uses context for RAG-based summaries.\nfunction SummarizeContent(content: string, content_type: ContentType, context: string?) -> Summary {\n client LLMFallback\n prompt #\"\n You are an expert summarization engine. 
Your goal is to provide a clear and concise summary of the given text.\n\n Content Type: {{ content_type }}\n {% if context %}\n Relevant Context (from RAG):\n ---\n {{ context }}\n ---\n {% endif %}\n\n Original Content:\n ---\n {{ content }}\n ---\n\n Based *only* on the provided Original Content {% if context %}and the Relevant Context{% endif %}, generate the answer.\n\n Format your response strictly as the 'Summary' class structure. Ensure the title, key points, and summary are distinct and accurately reflect the source material. \n Do not include any information not present in the provided text or context.\n\n # Instructions\n - If the link is for a paper, you need to explain what the paper is trying to solve and how, in separate sections: \n '## What is the problem the paper is trying to solve?'\n '## How does the paper attempt to solve the problem?'\n - If it's a blog post or webpage, you have to explain like: 'This post or blog or webpage is about ...'\n - If it's a github repo, you have to explain like: 'This github repo is about ... and tries to solve .... It uses ...'\n - If it's an arXiv or any other paper, do not mention info about DOI or review status or stuff like that. Just mention the main points about the paper.\n \n ----\n {{ ctx.output_format }}\n \"#\n}\n\ntest SummarizeTest {\n functions [SummarizeContent]\n args {\n content #\"\n The Urgency of Interpretability\nApril 2025\nIn the decade that I have been working on AI, I’ve watched it grow from a tiny academic field to arguably the most important economic and geopolitical issue in the world. 
In all that time, perhaps the most important lesson I’ve learned is this: the progress of the underlying technology is inexorable, driven by forces too powerful to stop, but the way in which it happens—the order in which things are built, the applications we choose, and the details of how it is rolled out to society—are eminently possible to change, and it’s possible to have great positive impact by doing so. We can’t stop the bus, but we can steer it. In the past I’ve written about the importance of deploying AI in a way that is positive for the world, and of ensuring that democracies build and wield the technology before autocracies do. Over the last few months, I have become increasingly focused on an additional opportunity for steering the bus: the tantalizing possibility, opened up by some recent advances, that we could succeed at interpretability—that is, in understanding the inner workings of AI systems—before models reach an overwhelming level of power.\n \"#\n content_type #\"Webpage\"#\n }\n}\n", 23 | } 24 | 25 | def get_baml_files(): 26 | return file_map -------------------------------------------------------------------------------- /baml_client/parser.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . 
import types, partial_types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class LlmResponseParser: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | def RouteRequest( 41 | self, 42 | llm_response: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> types.ExtractorTool: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | parsed = self.__runtime.parse_llm_response( 53 | "RouteRequest", 54 | llm_response, 55 | types, 56 | types, 57 | partial_types, 58 | False, 59 | self.__ctx_manager.get(), 60 | tb, 61 | __cr__, 62 | ) 63 | 64 | return cast(types.ExtractorTool, parsed) 65 | 66 | def SummarizeContent( 67 | self, 68 | llm_response: str, 69 | baml_options: BamlCallOptions = {}, 70 | ) -> types.Summary: 71 | __tb__ = baml_options.get("tb", None) 72 | if __tb__ is not None: 73 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 74 | else: 75 | tb = None 76 | __cr__ = baml_options.get("client_registry", None) 77 | 78 | parsed = self.__runtime.parse_llm_response( 79 | "SummarizeContent", 80 | llm_response, 81 | types, 82 | types, 83 | partial_types, 84 | False, 85 | self.__ctx_manager.get(), 86 | tb, 87 | __cr__, 88 | ) 89 | 90 | return cast(types.Summary, parsed) 91 | 92 | 93 | 94 | class LlmStreamParser: 95 | __runtime: baml_py.BamlRuntime 96 | __ctx_manager: baml_py.BamlCtxManager 97 | 98 | def __init__(self, runtime: baml_py.BamlRuntime, 
ctx_manager: baml_py.BamlCtxManager): 99 | self.__runtime = runtime 100 | self.__ctx_manager = ctx_manager 101 | 102 | 103 | def RouteRequest( 104 | self, 105 | llm_response: str, 106 | baml_options: BamlCallOptions = {}, 107 | ) -> Optional[types.ExtractorTool]: 108 | __tb__ = baml_options.get("tb", None) 109 | if __tb__ is not None: 110 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 111 | else: 112 | tb = None 113 | __cr__ = baml_options.get("client_registry", None) 114 | 115 | parsed = self.__runtime.parse_llm_response( 116 | "RouteRequest", 117 | llm_response, 118 | types, 119 | types, 120 | partial_types, 121 | True, 122 | self.__ctx_manager.get(), 123 | tb, 124 | __cr__, 125 | ) 126 | 127 | return cast(Optional[types.ExtractorTool], parsed) 128 | 129 | def SummarizeContent( 130 | self, 131 | llm_response: str, 132 | baml_options: BamlCallOptions = {}, 133 | ) -> partial_types.Summary: 134 | __tb__ = baml_options.get("tb", None) 135 | if __tb__ is not None: 136 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 137 | else: 138 | tb = None 139 | __cr__ = baml_options.get("client_registry", None) 140 | 141 | parsed = self.__runtime.parse_llm_response( 142 | "SummarizeContent", 143 | llm_response, 144 | types, 145 | types, 146 | partial_types, 147 | True, 148 | self.__ctx_manager.get(), 149 | tb, 150 | __cr__, 151 | ) 152 | 153 | return cast(partial_types.Summary, parsed) 154 | 155 | 156 | 157 | __all__ = ["LlmResponseParser", "LlmStreamParser"] -------------------------------------------------------------------------------- /baml_client/partial_types.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! 
To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import baml_py 17 | from enum import Enum 18 | from pydantic import BaseModel, ConfigDict 19 | from typing_extensions import TypeAlias 20 | from typing import Dict, Generic, List, Optional, TypeVar, Union, Literal 21 | 22 | from . import types 23 | from .types import Checked, Check 24 | 25 | ############################################################################### 26 | # 27 | # These types are used for streaming, for when an instance of a type 28 | # is still being built up and any of its fields is not yet fully available. 29 | # 30 | ############################################################################### 31 | 32 | T = TypeVar('T') 33 | class StreamState(BaseModel, Generic[T]): 34 | value: T 35 | state: Literal["Pending", "Incomplete", "Complete"] 36 | 37 | 38 | class Summary(BaseModel): 39 | title: Optional[str] = None 40 | key_points: List[str] 41 | concise_summary: Optional[str] = None 42 | -------------------------------------------------------------------------------- /baml_client/sync_client.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, TypeVar, Union, TypedDict, Type, Literal, cast 17 | from typing_extensions import NotRequired 18 | import pprint 19 | 20 | import baml_py 21 | from pydantic import BaseModel, ValidationError, create_model 22 | 23 | from . import partial_types, types 24 | from .types import Checked, Check 25 | from .type_builder import TypeBuilder 26 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 27 | from .sync_request import HttpRequest, HttpStreamRequest 28 | from .parser import LlmResponseParser, LlmStreamParser 29 | 30 | OutputType = TypeVar('OutputType') 31 | 32 | 33 | # Define the TypedDict with optional parameters having default values 34 | class BamlCallOptions(TypedDict, total=False): 35 | tb: NotRequired[TypeBuilder] 36 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 37 | collector: NotRequired[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] 38 | 39 | 40 | class BamlSyncClient: 41 | __runtime: baml_py.BamlRuntime 42 | __ctx_manager: baml_py.BamlCtxManager 43 | __stream_client: "BamlStreamClient" 44 | __http_request: "HttpRequest" 45 | __http_stream_request: "HttpStreamRequest" 46 | __llm_response_parser: LlmResponseParser 47 | __baml_options: BamlCallOptions 48 | 49 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 50 | self.__runtime = runtime 51 | self.__ctx_manager = ctx_manager 52 | self.__stream_client = BamlStreamClient(self.__runtime, self.__ctx_manager, baml_options) 53 | self.__http_request = HttpRequest(self.__runtime, self.__ctx_manager) 54 | self.__http_stream_request = HttpStreamRequest(self.__runtime, self.__ctx_manager) 55 | self.__llm_response_parser = 
LlmResponseParser(self.__runtime, self.__ctx_manager) 56 | self.__llm_stream_parser = LlmStreamParser(self.__runtime, self.__ctx_manager) 57 | self.__baml_options = baml_options or {} 58 | 59 | @property 60 | def stream(self): 61 | return self.__stream_client 62 | 63 | @property 64 | def request(self): 65 | return self.__http_request 66 | 67 | @property 68 | def stream_request(self): 69 | return self.__http_stream_request 70 | 71 | @property 72 | def parse(self): 73 | return self.__llm_response_parser 74 | 75 | @property 76 | def parse_stream(self): 77 | return self.__llm_stream_parser 78 | 79 | def with_options( 80 | self, 81 | tb: Optional[TypeBuilder] = None, 82 | client_registry: Optional[baml_py.baml_py.ClientRegistry] = None, 83 | collector: Optional[Union[baml_py.baml_py.Collector, List[baml_py.baml_py.Collector]]] = None, 84 | ) -> "BamlSyncClient": 85 | """ 86 | Returns a new instance of BamlSyncClient with explicitly typed baml options 87 | for Python 3.8 compatibility. 88 | """ 89 | new_options: BamlCallOptions = self.__baml_options.copy() 90 | 91 | # Override if any keyword arguments were provided. 
92 | if tb is not None: 93 | new_options["tb"] = tb 94 | if client_registry is not None: 95 | new_options["client_registry"] = client_registry 96 | if collector is not None: 97 | new_options["collector"] = collector 98 | return BamlSyncClient(self.__runtime, self.__ctx_manager, new_options) 99 | 100 | 101 | def RouteRequest( 102 | self, 103 | original_message: str, 104 | baml_options: BamlCallOptions = {}, 105 | ) -> types.ExtractorTool: 106 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 107 | __tb__ = options.get("tb", None) 108 | if __tb__ is not None: 109 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 110 | else: 111 | tb = None 112 | __cr__ = options.get("client_registry", None) 113 | collector = options.get("collector", None) 114 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 115 | 116 | raw = self.__runtime.call_function_sync( 117 | "RouteRequest", 118 | { 119 | "original_message": original_message, 120 | }, 121 | self.__ctx_manager.get(), 122 | tb, 123 | __cr__, 124 | collectors, 125 | ) 126 | return cast(types.ExtractorTool, raw.cast_to(types, types, partial_types, False)) 127 | 128 | def SummarizeContent( 129 | self, 130 | content: str,content_type: types.ContentType,context: Optional[str], 131 | baml_options: BamlCallOptions = {}, 132 | ) -> types.Summary: 133 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 134 | __tb__ = options.get("tb", None) 135 | if __tb__ is not None: 136 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 137 | else: 138 | tb = None 139 | __cr__ = options.get("client_registry", None) 140 | collector = options.get("collector", None) 141 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 142 | 143 | raw = self.__runtime.call_function_sync( 144 | "SummarizeContent", 145 | { 146 | "content": 
content,"content_type": content_type,"context": context, 147 | }, 148 | self.__ctx_manager.get(), 149 | tb, 150 | __cr__, 151 | collectors, 152 | ) 153 | return cast(types.Summary, raw.cast_to(types, types, partial_types, False)) 154 | 155 | 156 | 157 | 158 | class BamlStreamClient: 159 | __runtime: baml_py.BamlRuntime 160 | __ctx_manager: baml_py.BamlCtxManager 161 | __baml_options: BamlCallOptions 162 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager, baml_options: Optional[BamlCallOptions] = None): 163 | self.__runtime = runtime 164 | self.__ctx_manager = ctx_manager 165 | self.__baml_options = baml_options or {} 166 | 167 | 168 | def RouteRequest( 169 | self, 170 | original_message: str, 171 | baml_options: BamlCallOptions = {}, 172 | ) -> baml_py.BamlSyncStream[Optional[types.ExtractorTool], types.ExtractorTool]: 173 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 174 | __tb__ = options.get("tb", None) 175 | if __tb__ is not None: 176 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 177 | else: 178 | tb = None 179 | __cr__ = options.get("client_registry", None) 180 | collector = options.get("collector", None) 181 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 182 | 183 | raw = self.__runtime.stream_function_sync( 184 | "RouteRequest", 185 | { 186 | "original_message": original_message, 187 | }, 188 | None, 189 | self.__ctx_manager.get(), 190 | tb, 191 | __cr__, 192 | collectors, 193 | ) 194 | 195 | return baml_py.BamlSyncStream[Optional[types.ExtractorTool], types.ExtractorTool]( 196 | raw, 197 | lambda x: cast(Optional[types.ExtractorTool], x.cast_to(types, types, partial_types, True)), 198 | lambda x: cast(types.ExtractorTool, x.cast_to(types, types, partial_types, False)), 199 | self.__ctx_manager.get(), 200 | ) 201 | 202 | def SummarizeContent( 203 | self, 204 | content: str,content_type: 
types.ContentType,context: Optional[str], 205 | baml_options: BamlCallOptions = {}, 206 | ) -> baml_py.BamlSyncStream[partial_types.Summary, types.Summary]: 207 | options: BamlCallOptions = {**self.__baml_options, **(baml_options or {})} 208 | __tb__ = options.get("tb", None) 209 | if __tb__ is not None: 210 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 211 | else: 212 | tb = None 213 | __cr__ = options.get("client_registry", None) 214 | collector = options.get("collector", None) 215 | collectors = collector if isinstance(collector, list) else [collector] if collector is not None else [] 216 | 217 | raw = self.__runtime.stream_function_sync( 218 | "SummarizeContent", 219 | { 220 | "content": content, 221 | "content_type": content_type, 222 | "context": context, 223 | }, 224 | None, 225 | self.__ctx_manager.get(), 226 | tb, 227 | __cr__, 228 | collectors, 229 | ) 230 | 231 | return baml_py.BamlSyncStream[partial_types.Summary, types.Summary]( 232 | raw, 233 | lambda x: cast(partial_types.Summary, x.cast_to(types, types, partial_types, True)), 234 | lambda x: cast(types.Summary, x.cast_to(types, types, partial_types, False)), 235 | self.__ctx_manager.get(), 236 | ) 237 | 238 | 239 | 240 | b = BamlSyncClient(DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME, DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX) 241 | 242 | __all__ = ["b"] -------------------------------------------------------------------------------- /baml_client/sync_request.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. 
Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from typing import Any, Dict, List, Optional, Union, TypedDict, Type, Literal 17 | from typing_extensions import NotRequired 18 | 19 | import baml_py 20 | 21 | from . import types 22 | from .types import Checked, Check 23 | from .type_builder import TypeBuilder 24 | 25 | 26 | class BamlCallOptions(TypedDict, total=False): 27 | tb: NotRequired[TypeBuilder] 28 | client_registry: NotRequired[baml_py.baml_py.ClientRegistry] 29 | 30 | 31 | class HttpRequest: 32 | __runtime: baml_py.BamlRuntime 33 | __ctx_manager: baml_py.BamlCtxManager 34 | 35 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 36 | self.__runtime = runtime 37 | self.__ctx_manager = ctx_manager 38 | 39 | 40 | def RouteRequest( 41 | self, 42 | original_message: str, 43 | baml_options: BamlCallOptions = {}, 44 | ) -> baml_py.HTTPRequest: 45 | __tb__ = baml_options.get("tb", None) 46 | if __tb__ is not None: 47 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 48 | else: 49 | tb = None 50 | __cr__ = baml_options.get("client_registry", None) 51 | 52 | return self.__runtime.build_request_sync( 53 | "RouteRequest", 54 | { 55 | "original_message": original_message, 56 | }, 57 | self.__ctx_manager.get(), 58 | tb, 59 | __cr__, 60 | False, 61 | ) 62 | 63 | def SummarizeContent( 64 | self, 65 | content: str,content_type: types.ContentType,context: Optional[str], 66 | baml_options: BamlCallOptions = {}, 67 | ) -> baml_py.HTTPRequest: 68 | __tb__ = baml_options.get("tb", None) 69 | if __tb__ is not None: 70 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 71 | else: 72 | tb = None 73 | __cr__ = baml_options.get("client_registry", None) 74 | 75 | return self.__runtime.build_request_sync( 76 | "SummarizeContent", 77 | { 78 | 
"content": content,"content_type": content_type,"context": context, 79 | }, 80 | self.__ctx_manager.get(), 81 | tb, 82 | __cr__, 83 | False, 84 | ) 85 | 86 | 87 | 88 | class HttpStreamRequest: 89 | __runtime: baml_py.BamlRuntime 90 | __ctx_manager: baml_py.BamlCtxManager 91 | 92 | def __init__(self, runtime: baml_py.BamlRuntime, ctx_manager: baml_py.BamlCtxManager): 93 | self.__runtime = runtime 94 | self.__ctx_manager = ctx_manager 95 | 96 | 97 | def RouteRequest( 98 | self, 99 | original_message: str, 100 | baml_options: BamlCallOptions = {}, 101 | ) -> baml_py.HTTPRequest: 102 | __tb__ = baml_options.get("tb", None) 103 | if __tb__ is not None: 104 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 105 | else: 106 | tb = None 107 | __cr__ = baml_options.get("client_registry", None) 108 | 109 | return self.__runtime.build_request_sync( 110 | "RouteRequest", 111 | { 112 | "original_message": original_message, 113 | }, 114 | self.__ctx_manager.get(), 115 | tb, 116 | __cr__, 117 | True, 118 | ) 119 | 120 | def SummarizeContent( 121 | self, 122 | content: str,content_type: types.ContentType,context: Optional[str], 123 | baml_options: BamlCallOptions = {}, 124 | ) -> baml_py.HTTPRequest: 125 | __tb__ = baml_options.get("tb", None) 126 | if __tb__ is not None: 127 | tb = __tb__._tb # type: ignore (we know how to use this private attribute) 128 | else: 129 | tb = None 130 | __cr__ = baml_options.get("client_registry", None) 131 | 132 | return self.__runtime.build_request_sync( 133 | "SummarizeContent", 134 | { 135 | "content": content,"content_type": content_type,"context": context, 136 | }, 137 | self.__ctx_manager.get(), 138 | tb, 139 | __cr__, 140 | True, 141 | ) 142 | 143 | 144 | 145 | __all__ = ["HttpRequest", "HttpStreamRequest"] -------------------------------------------------------------------------------- /baml_client/tracing.py: -------------------------------------------------------------------------------- 1 | 
############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX 17 | 18 | trace = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.trace_fn 19 | set_tags = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.upsert_tags 20 | def flush(): 21 | DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.flush() 22 | on_log_event = DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_CTX.on_log_event 23 | 24 | 25 | __all__ = ['trace', 'set_tags', "flush", "on_log_event"] -------------------------------------------------------------------------------- /baml_client/type_builder.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 
11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import typing 17 | from baml_py.baml_py import FieldType, EnumValueBuilder, EnumBuilder, ClassBuilder 18 | from baml_py.type_builder import TypeBuilder as _TypeBuilder, ClassPropertyBuilder, ClassPropertyViewer, EnumValueViewer 19 | from .globals import DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME 20 | 21 | 22 | class TypeBuilder(_TypeBuilder): 23 | def __init__(self): 24 | super().__init__(classes=set( 25 | ["Summary",] 26 | ), enums=set( 27 | ["ContentType","ExtractorTool",] 28 | ), runtime=DO_NOT_USE_DIRECTLY_UNLESS_YOU_KNOW_WHAT_YOURE_DOING_RUNTIME) 29 | 30 | 31 | @property 32 | def Summary(self) -> "SummaryAst": 33 | return SummaryAst(self) 34 | 35 | 36 | 37 | 38 | 39 | class SummaryAst: 40 | def __init__(self, tb: _TypeBuilder): 41 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 42 | self._bldr = _tb.class_("Summary") 43 | self._properties: typing.Set[str] = set([ "title", "key_points", "concise_summary", ]) 44 | self._props = SummaryProperties(self._bldr, self._properties) 45 | 46 | def type(self) -> FieldType: 47 | return self._bldr.field() 48 | 49 | @property 50 | def props(self) -> "SummaryProperties": 51 | return self._props 52 | 53 | 54 | class SummaryViewer(SummaryAst): 55 | def __init__(self, tb: _TypeBuilder): 56 | super().__init__(tb) 57 | 58 | 59 | def list_properties(self) -> typing.List[typing.Tuple[str, ClassPropertyViewer]]: 60 | return [(name, ClassPropertyViewer(self._bldr.property(name))) for name in self._properties] 61 | 62 | 63 | 64 | class SummaryProperties: 65 | def __init__(self, bldr: ClassBuilder, properties: typing.Set[str]): 66 | self.__bldr = bldr 67 | self.__properties = properties 68 | 69 | 70 | 71 | @property 72 | def title(self) -> ClassPropertyViewer: 73 | return ClassPropertyViewer(self.__bldr.property("title")) 74 | 75 | @property 76 | def 
key_points(self) -> ClassPropertyViewer: 77 | return ClassPropertyViewer(self.__bldr.property("key_points")) 78 | 79 | @property 80 | def concise_summary(self) -> ClassPropertyViewer: 81 | return ClassPropertyViewer(self.__bldr.property("concise_summary")) 82 | 83 | 84 | 85 | 86 | 87 | class ContentTypeAst: 88 | def __init__(self, tb: _TypeBuilder): 89 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 90 | self._bldr = _tb.enum("ContentType") 91 | self._values: typing.Set[str] = set([ "Webpage", "PDF", "GenericText", ]) 92 | self._vals = ContentTypeValues(self._bldr, self._values) 93 | 94 | def type(self) -> FieldType: 95 | return self._bldr.field() 96 | 97 | @property 98 | def values(self) -> "ContentTypeValues": 99 | return self._vals 100 | 101 | 102 | class ContentTypeViewer(ContentTypeAst): 103 | def __init__(self, tb: _TypeBuilder): 104 | super().__init__(tb) 105 | 106 | def list_values(self) -> typing.List[typing.Tuple[str, EnumValueViewer]]: 107 | return [(name, EnumValueViewer(self._bldr.value(name))) for name in self._values] 108 | 109 | 110 | class ContentTypeValues: 111 | def __init__(self, enum_bldr: EnumBuilder, values: typing.Set[str]): 112 | self.__bldr = enum_bldr 113 | self.__values = values 114 | 115 | 116 | 117 | @property 118 | def Webpage(self) -> EnumValueViewer: 119 | return EnumValueViewer(self.__bldr.value("Webpage")) 120 | 121 | 122 | @property 123 | def PDF(self) -> EnumValueViewer: 124 | return EnumValueViewer(self.__bldr.value("PDF")) 125 | 126 | 127 | @property 128 | def GenericText(self) -> EnumValueViewer: 129 | return EnumValueViewer(self.__bldr.value("GenericText")) 130 | 131 | 132 | 133 | 134 | class ExtractorToolAst: 135 | def __init__(self, tb: _TypeBuilder): 136 | _tb = tb._tb # type: ignore (we know how to use this private attribute) 137 | self._bldr = _tb.enum("ExtractorTool") 138 | self._values: typing.Set[str] = set([ "WebpageExtractor", "PDFExtractor", "TwitterExtractor", "LinkedInExtractor", 
"Unsupported", "YoutubeExtractor", ]) 139 | self._vals = ExtractorToolValues(self._bldr, self._values) 140 | 141 | def type(self) -> FieldType: 142 | return self._bldr.field() 143 | 144 | @property 145 | def values(self) -> "ExtractorToolValues": 146 | return self._vals 147 | 148 | 149 | class ExtractorToolViewer(ExtractorToolAst): 150 | def __init__(self, tb: _TypeBuilder): 151 | super().__init__(tb) 152 | 153 | def list_values(self) -> typing.List[typing.Tuple[str, EnumValueViewer]]: 154 | return [(name, EnumValueViewer(self._bldr.value(name))) for name in self._values] 155 | 156 | 157 | class ExtractorToolValues: 158 | def __init__(self, enum_bldr: EnumBuilder, values: typing.Set[str]): 159 | self.__bldr = enum_bldr 160 | self.__values = values 161 | 162 | 163 | 164 | @property 165 | def WebpageExtractor(self) -> EnumValueViewer: 166 | return EnumValueViewer(self.__bldr.value("WebpageExtractor")) 167 | 168 | 169 | @property 170 | def PDFExtractor(self) -> EnumValueViewer: 171 | return EnumValueViewer(self.__bldr.value("PDFExtractor")) 172 | 173 | 174 | @property 175 | def TwitterExtractor(self) -> EnumValueViewer: 176 | return EnumValueViewer(self.__bldr.value("TwitterExtractor")) 177 | 178 | 179 | @property 180 | def LinkedInExtractor(self) -> EnumValueViewer: 181 | return EnumValueViewer(self.__bldr.value("LinkedInExtractor")) 182 | 183 | 184 | @property 185 | def Unsupported(self) -> EnumValueViewer: 186 | return EnumValueViewer(self.__bldr.value("Unsupported")) 187 | 188 | 189 | @property 190 | def YoutubeExtractor(self) -> EnumValueViewer: 191 | return EnumValueViewer(self.__bldr.value("YoutubeExtractor")) 192 | 193 | 194 | 195 | 196 | 197 | __all__ = ["TypeBuilder"] -------------------------------------------------------------------------------- /baml_client/types.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 3 | # Welcome to 
Baml! To use this generated code, please run the following: 4 | # 5 | # $ pip install baml-py 6 | # 7 | ############################################################################### 8 | 9 | # This file was generated by BAML: please do not edit it. Instead, edit the 10 | # BAML files and re-generate this code. 11 | # 12 | # ruff: noqa: E501,F401 13 | # flake8: noqa: E501,F401 14 | # pylint: disable=unused-import,line-too-long 15 | # fmt: off 16 | import baml_py 17 | from enum import Enum 18 | from pydantic import BaseModel, ConfigDict 19 | from typing_extensions import TypeAlias 20 | from typing import Dict, Generic, List, Literal, Optional, TypeVar, Union 21 | 22 | 23 | T = TypeVar('T') 24 | CheckName = TypeVar('CheckName', bound=str) 25 | 26 | class Check(BaseModel): 27 | name: str 28 | expression: str 29 | status: str 30 | 31 | class Checked(BaseModel, Generic[T,CheckName]): 32 | value: T 33 | checks: Dict[CheckName, Check] 34 | 35 | def get_checks(checks: Dict[CheckName, Check]) -> List[Check]: 36 | return list(checks.values()) 37 | 38 | def all_succeeded(checks: Dict[CheckName, Check]) -> bool: 39 | return all(check.status == "succeeded" for check in get_checks(checks)) 40 | 41 | 42 | 43 | class ContentType(str, Enum): 44 | 45 | Webpage = "Webpage" 46 | PDF = "PDF" 47 | GenericText = "GenericText" 48 | 49 | class ExtractorTool(str, Enum): 50 | 51 | WebpageExtractor = "WebpageExtractor" 52 | PDFExtractor = "PDFExtractor" 53 | TwitterExtractor = "TwitterExtractor" 54 | LinkedInExtractor = "LinkedInExtractor" 55 | Unsupported = "Unsupported" 56 | YoutubeExtractor = "YoutubeExtractor" 57 | 58 | class Summary(BaseModel): 59 | title: str 60 | key_points: List[str] 61 | concise_summary: str 62 | -------------------------------------------------------------------------------- /baml_src/clients.baml: -------------------------------------------------------------------------------- 1 | client Gemini2_5_flash { 2 | provider google-ai 3 | options { 4 | model 
gemini-2.5-flash-preview-04-17 5 | api_key env.GEMINI_API_KEY 6 | } 7 | } 8 | 9 | client Gemini2_5_pro { 10 | provider google-ai 11 | options { 12 | model gemini-2.5-pro-exp-03-25 13 | api_key env.GEMINI_API_KEY 14 | } 15 | } 16 | 17 | client DeepSeekR1 { 18 | provider "openai" 19 | options { 20 | api_key env.DEEPSEEK_API_KEY 21 | base_url "https://api.deepseek.com" 22 | model "deepseek-reasoner" 23 | } 24 | } 25 | 26 | client DeepSeekV3 { 27 | provider "openai" 28 | options { 29 | api_key env.DEEPSEEK_API_KEY 30 | base_url "https://api.deepseek.com" 31 | model "deepseek-chat" 32 | temperature 0.1 33 | } 34 | } 35 | 36 | // https://docs.boundaryml.com/docs/snippets/clients/fallback 37 | client LLMFallback { 38 | provider fallback 39 | options { 40 | // This will try the clients in order until one succeeds 41 | strategy [DeepSeekV3, Gemini2_5_flash] 42 | } 43 | } 44 | 45 | // https://docs.boundaryml.com/docs/snippets/clients/retry 46 | retry_policy Constant { 47 | max_retries 3 48 | // Strategy is optional 49 | strategy { 50 | type constant_delay 51 | delay_ms 200 52 | } 53 | } 54 | 55 | retry_policy Exponential { 56 | max_retries 2 57 | // Strategy is optional 58 | strategy { 59 | type exponential_backoff 60 | delay_ms 300 61 | multiplier 1.5 62 | max_delay_ms 10000 63 | } 64 | } -------------------------------------------------------------------------------- /baml_src/generators.baml: -------------------------------------------------------------------------------- 1 | // This helps use auto generate libraries you can use in the language of 2 | // your choice. You can have multiple generators if you use multiple languages. 3 | // Just ensure that the output_dir is different for each generator. 
4 | generator target { 5 | // Valid values: "python/pydantic", "typescript", "ruby/sorbet", "rest/openapi" 6 | output_type "python/pydantic" 7 | 8 | // Where the generated code will be saved (relative to baml_src/) 9 | output_dir "../" 10 | 11 | // The version of the BAML package you have installed (e.g. same version as your baml-py or @boundaryml/baml). 12 | // The BAML VSCode extension version should also match this version. 13 | version "0.88.0" 14 | 15 | // Valid values: "sync", "async" 16 | // This controls what `b.FunctionName()` will be (sync or async). 17 | default_client_mode sync 18 | } 19 | -------------------------------------------------------------------------------- /baml_src/router.baml: -------------------------------------------------------------------------------- 1 | // Define the possible extraction tools 2 | enum ExtractorTool { 3 | WebpageExtractor // For general webpages 4 | PDFExtractor // For PDF documents 5 | TwitterExtractor // For Twitter/X URLs 6 | LinkedInExtractor // For LinkedIn post URLs 7 | Unsupported // For URLs or content types we cannot handle 8 | YoutubeExtractor // For YouTube video URLs 9 | } 10 | 11 | // Define the router function 12 | // It takes the original message and decides which tool to use. 13 | function RouteRequest(original_message: string) -> ExtractorTool { 14 | // Use a capable but fast client for routing 15 | // client Gemini2_5_flash 16 | client LLMFallback 17 | 18 | prompt #" 19 | Analyze the following user message and determine the best tool to use for extracting content from any URL present. 20 | 21 | User Message: 22 | --- 23 | {{ original_message }} 24 | --- 25 | 26 | Identify the primary URL in the message. Based *only* on the URL's structure or file extension, choose one of the following tools: 27 | 28 | - If the URL points to a PDF file (ends with .pdf), choose PDFExtractor. 29 | - If the URL is from Twitter or X (contains twitter.com or x.com), choose TwitterExtractor. 
30 | - If the URL is a LinkedIn post (contains linkedin.com/posts/), choose LinkedInExtractor. 31 | - If the URL is a YouTube video (contains youtube.com/watch or youtu.be/), choose YoutubeExtractor. 32 | - For all other standard web URLs (http or https), choose WebpageExtractor. 33 | - If no URL is found, or the URL type is clearly unsupported (e.g., ftp://, mailto:), choose Unsupported. 34 | 35 | Output *only* the name of the chosen tool from the 'ExtractorTool' enum. 36 | 37 | {{ ctx.output_format }} 38 | "# 39 | } 40 | 41 | // Optional test case 42 | test RouteWebpage { 43 | functions [RouteRequest] 44 | args { 45 | original_message #"Summarize this blog post: https://example.com/blog/article-123"# 46 | } 47 | } 48 | 49 | test RoutePDF { 50 | functions [RouteRequest] 51 | args { 52 | original_message #"Can you process this PDF? https://arxiv.org/pdf/2401.0001.pdf"# 53 | } 54 | } 55 | 56 | test RouteTwitter { 57 | functions [RouteRequest] 58 | args { 59 | original_message #"Look at this thread: https://x.com/user/status/12345"# 60 | } 61 | } 62 | 63 | test RouteLinkedIn { 64 | functions [RouteRequest] 65 | args { 66 | original_message #"Interesting thoughts here: https://www.linkedin.com/posts/someuser_activity-1234567890-abcd?utm_source=share"# 67 | } 68 | } 69 | 70 | test RouteNoURL { 71 | functions [RouteRequest] 72 | args { 73 | original_message #"Tell me a joke."# 74 | } 75 | } 76 | 77 | test RouteUnsupported { 78 | functions [RouteRequest] 79 | args { 80 | original_message #"Check this out: ftp://files.example.com/data.zip"# 81 | } 82 | } 83 | 84 | test RouteYouTube { 85 | functions [RouteRequest] 86 | args { 87 | original_message #"Summarize this video: https://www.youtube.com/watch?v=dQw4w9WgXcQ"# 88 | } 89 | } 90 | 91 | -------------------------------------------------------------------------------- /baml_src/summarize.baml: -------------------------------------------------------------------------------- 1 | 2 | // Define an enum for the type of content 
being summarized 3 | enum ContentType { 4 | Webpage 5 | PDF 6 | GenericText 7 | } 8 | 9 | // Define the structure for the summarization output 10 | class Summary { 11 | title string @description("A concise and informative title for the summarized content (max 10 words).") 12 | key_points string[] @description("A list of the most important points or takeaways from the content. (3-5 points)") 13 | concise_summary string @description("A brief paragraph summarizing the entire content. (50-100 words)") 14 | } 15 | 16 | // Define the main summarization function 17 | // This function handles shorter texts directly or uses context for RAG-based summaries. 18 | function SummarizeContent(content: string, content_type: ContentType, context: string?) -> Summary { 19 | client LLMFallback 20 | prompt #" 21 | You are an expert summarization engine. Your goal is to provide a clear and concise summary of the given text. 22 | 23 | Content Type: {{ content_type }} 24 | {% if context %} 25 | Relevant Context (from RAG): 26 | --- 27 | {{ context }} 28 | --- 29 | {% endif %} 30 | 31 | Original Content: 32 | --- 33 | {{ content }} 34 | --- 35 | 36 | Based *only* on the provided Original Content {% if context %}and the Relevant Context{% endif %}, generate the answer. 37 | 38 | Format your response strictly as the 'Summary' class structure. Ensure the title, key points, and summary are distinct and accurately reflect the source material. 39 | Do not include any information not present in the provided text or context. 40 | 41 | # Instructions 42 | - If the link is for a paper, you need to explain what the paper is trying to solve and how, in separate sections: 43 | '## What is the problem the paper is trying to solve?' 44 | '## How does the paper attempt to solve the problem?' 45 | - If it's a blog post or webpage, you have to explain like: 'This post or blog or webpage is about ...' 46 | - If it's a github repo, you have to explain like: 'This github repo is about ...
and tries to solve .... It uses ...' 47 | - If it's an arXiv or any other paper, do not mention info about DOI or review status or stuff like that. Just mention the main points about the paper. 48 | 49 | ---- 50 | {{ ctx.output_format}} 51 | "# 52 | } 53 | 54 | test SummarizeTest { 55 | functions [SummarizeContent] 56 | args { 57 | content #" 58 | The Urgency of Interpretability 59 | April 2025 60 | In the decade that I have been working on AI, I’ve watched it grow from a tiny academic field to arguably the most important economic and geopolitical issue in the world. In all that time, perhaps the most important lesson I’ve learned is this: the progress of the underlying technology is inexorable, driven by forces too powerful to stop, but the way in which it happens—the order in which things are built, the applications we choose, and the details of how it is rolled out to society—are eminently possible to change, and it’s possible to have great positive impact by doing so. We can’t stop the bus, but we can steer it. In the past I’ve written about the importance of deploying AI in a way that is positive for the world, and of ensuring that democracies build and wield the technology before autocracies do. Over the last few months, I have become increasingly focused on an additional opportunity for steering the bus: the tantalizing possibility, opened up by some recent advances, that we could succeed at interpretability—that is, in understanding the inner workings of AI systems—before models reach an overwhelming level of power.
61 | "# 62 | content_type #"Webpage"# 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from contextlib import asynccontextmanager 4 | from dotenv import load_dotenv 5 | import asyncio 6 | import re 7 | import html # <-- Add this import 8 | import json 9 | 10 | from fastapi import FastAPI, Request, Response, HTTPException, Header, APIRouter 11 | import uvicorn 12 | 13 | from telegram import Update 14 | from telegram.ext import Application, MessageHandler, filters, ContextTypes 15 | from telegram.constants import ParseMode 16 | 17 | # Import the agent runner 18 | from agent import run_agent 19 | 20 | # Load environment variables from .env file 21 | load_dotenv(override=True) 22 | 23 | # --- Logging Setup --- 24 | logging.basicConfig( 25 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO 26 | ) 27 | logging.getLogger("httpx").setLevel(logging.WARNING) 28 | logger = logging.getLogger(__name__) 29 | 30 | # --- Constants --- 31 | # Simple regex to find the first URL in a message 32 | URL_REGEX = r"(https?:\/\/[^\s]+)" 33 | 34 | # --- Environment Variables & Constants --- 35 | BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN") 36 | # For Cloud Run, we should use the service URL as the webhook URL 37 | # if not explicitly set through WEBHOOK_URL 38 | WEBHOOK_URL = os.getenv("WEBHOOK_URL") 39 | WEBHOOK_SECRET_PATH = os.getenv("WEBHOOK_SECRET_PATH", "webhook") 40 | 41 | # If we're in Cloud Run, we'll see these environment variables 42 | CLOUD_RUN_SERVICE_URL = os.getenv("K_SERVICE") # Will be set in Cloud Run 43 | 44 | if not BOT_TOKEN: 45 | logger.critical("TELEGRAM_BOT_TOKEN missing. 
Bot cannot start.") 46 | exit() 47 | 48 | # If we're in Cloud Run but no WEBHOOK_URL is set, use inference 49 | if CLOUD_RUN_SERVICE_URL and not WEBHOOK_URL: 50 | WEBHOOK_URL = f"https://{os.getenv('K_SERVICE')}-{os.getenv('K_REVISION', 'latest')}.{os.getenv('K_REGION', 'unknown')}.run.app" 51 | logger.info(f"Running in Cloud Run, inferred WEBHOOK_URL: {WEBHOOK_URL}") 52 | 53 | if not WEBHOOK_URL: 54 | logger.warning( 55 | "WEBHOOK_URL missing. Webhook setup will be skipped (local testing?)." 56 | ) 57 | 58 | 59 | # --- Global Application Object --- 60 | ptb_app = Application.builder().token(BOT_TOKEN).build() 61 | 62 | 63 | # --- Message Handler --- 64 | async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: 65 | message = update.effective_message 66 | text = message.text 67 | chat_id = message.chat_id 68 | logger.info(f"Received message in chat {chat_id}: {text}") 69 | 70 | # Simple check for URL 71 | if not any(url in text for url in ["http://", "https://"]): 72 | logger.info("Message does not contain a URL, ignoring.") 73 | return 74 | 75 | # Extract the first URL 76 | url_match = re.search(URL_REGEX, text) 77 | if not url_match: 78 | logger.info("No URL found in the message despite initial check. Ignoring.") 79 | return 80 | 81 | extracted_url = url_match.group(0) 82 | logger.info(f"Extracted URL: {extracted_url}") 83 | 84 | try: 85 | # Run the agent 86 | agent_result = await run_agent(text) 87 | 88 | # --- Process Agent Result --- 89 | MAX_LEN = 4096 # Max Telegram message length 90 | 91 | # Agent returns string (summary or error) or None. 92 | # Only proceed if we got a valid summary string (not starting with "Error:"). 93 | if isinstance(agent_result, str) and not agent_result.startswith("Error:"): 94 | logger.info( 95 | f"Agent returned valid summary (len {len(agent_result)} chars). Preparing message." 
96 | ) 97 | 98 | # Use agent result directly as the raw text to send (URL removed) 99 | text_to_send_raw = agent_result 100 | 101 | # Escape HTML characters for summary part to prevent parsing errors 102 | text_to_send_formatted = html.escape(agent_result) 103 | 104 | # Send text in chunks if too long 105 | for i in range(0, len(text_to_send_formatted), MAX_LEN): 106 | chunk = text_to_send_formatted[i : i + MAX_LEN] 107 | try: 108 | # Use HTML parse mode as we escaped the summary 109 | # Reply to the original message instead of just sending 110 | await message.reply_text(chunk, parse_mode=ParseMode.HTML) 111 | logger.info(f"Sent chunk {i // MAX_LEN + 1} successfully.") 112 | except Exception as send_err: 113 | logger.error( 114 | f"Failed to send chunk with HTML formatting: {send_err}. Trying plain text." 115 | ) 116 | # Fallback to sending raw chunk without formatting if HTML fails 117 | raw_chunk = text_to_send_raw[i : i + MAX_LEN] 118 | try: 119 | # Reply to the original message instead of just sending 120 | await message.reply_text(raw_chunk) 121 | logger.info( 122 | f"Sent chunk {i // MAX_LEN + 1} successfully (plain text fallback)." 123 | ) 124 | except Exception as plain_send_err: 125 | logger.error( 126 | f"Failed to send chunk even as plain text: {plain_send_err}" 127 | ) 128 | # Stop sending chunks if even plain text fails for one 129 | break 130 | 131 | if i + MAX_LEN < len(text_to_send_formatted): 132 | await asyncio.sleep(0.5) # Small delay between chunks 133 | 134 | # --- Silent Failure Cases --- 135 | elif isinstance(agent_result, str) and agent_result.startswith("Error:"): 136 | # Agent returned an error string 137 | logger.error( 138 | f"Agent failed for {extracted_url}. Error: {agent_result}. Not replying." 139 | ) 140 | # Do nothing in the chat 141 | 142 | else: 143 | # Agent returned None or unexpected type 144 | if agent_result is None: 145 | logger.error(f"Agent returned None for {extracted_url}. 
Not replying.") 146 | else: 147 | logger.error( 148 | f"Agent returned unexpected result type for {extracted_url}: {type(agent_result)}. Not replying." 149 | ) 150 | # Do nothing in the chat 151 | 152 | except Exception as e: 153 | # --- Main Execution Error --- 154 | # Log the error but do not send anything to the user 155 | logger.error( 156 | f"Unhandled exception processing message for URL {extracted_url}: {e}", 157 | exc_info=True, 158 | ) 159 | # Removed user-facing error reporting 160 | 161 | # Removed the finally block as the thinking_message is gone 162 | 163 | 164 | # --- FastAPI Lifespan Management (Setup/Teardown) --- 165 | @asynccontextmanager 166 | async def lifespan(app: FastAPI): 167 | # --- Startup --- 168 | logger.info("Application startup...") 169 | global ptb_app # Make sure we're modifying the global instance 170 | 171 | should_use_polling = os.getenv("USE_POLLING", "false").lower() == "true" 172 | 173 | logger.info("Initializing PTB application...") 174 | await ptb_app.initialize() 175 | url_handler = MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message) 176 | ptb_app.add_handler(url_handler) 177 | await ptb_app.start() # Start application components (like scheduler, etc.) 178 | 179 | polling_task = None 180 | if should_use_polling: 181 | logger.info( 182 | "Polling mode is active. Starting PTB polling loop in background..." 
183 | ) 184 | # Start polling in a background task so it doesn't block Uvicorn 185 | polling_task = asyncio.create_task( 186 | ptb_app.updater.start_polling(poll_interval=1.0) 187 | ) 188 | logger.info("PTB polling loop started.") 189 | 190 | elif WEBHOOK_URL: # Webhook mode 191 | full_webhook_url = ( 192 | f"{WEBHOOK_URL.rstrip('/')}/{WEBHOOK_SECRET_PATH.lstrip('/')}" 193 | ) 194 | logger.info(f"Setting webhook to: {full_webhook_url}") 195 | try: 196 | # ptb_app.start() should have registered the webhook if configured 197 | # Forcing it here to be sure, especially if start() behavior changes 198 | await ptb_app.bot.set_webhook( 199 | url=full_webhook_url, 200 | secret_token=os.getenv("TELEGRAM_WEBHOOK_SECRET_TOKEN"), 201 | allowed_updates=Update.ALL_TYPES, 202 | ) 203 | logger.info("Webhook explicitly set successfully.") 204 | except Exception as e: 205 | logger.error(f"Failed to set webhook: {e}", exc_info=True) 206 | else: # No polling and no WEBHOOK_URL 207 | logger.warning( 208 | "USE_POLLING is false and WEBHOOK_URL not set. Bot may not receive updates." 
209 | ) 210 | 211 | app.state.bot_initialized = True 212 | logger.info("Bot initialization complete.") 213 | 214 | yield 215 | 216 | # --- Shutdown --- 217 | logger.info("Application shutdown...") 218 | try: 219 | if polling_task and not polling_task.done(): 220 | logger.info("Polling mode: Stopping PTB polling loop...") 221 | ptb_app.updater.stop() # Request stop 222 | try: 223 | await asyncio.wait_for( 224 | polling_task, timeout=5.0 225 | ) # Wait for task to finish 226 | except asyncio.TimeoutError: 227 | logger.warning("Polling task did not finish in time, cancelling.") 228 | polling_task.cancel() 229 | except Exception as e: 230 | logger.error(f"Error stopping polling task: {e}") 231 | logger.info("PTB polling loop stopped.") 232 | elif ( 233 | WEBHOOK_URL and not should_use_polling 234 | ): # only delete webhook if it was set 235 | logger.info("Webhook mode: Attempting to delete webhook...") 236 | try: 237 | await ptb_app.bot.delete_webhook(drop_pending_updates=True) 238 | logger.info("Webhook deleted successfully.") 239 | except Exception as e: 240 | logger.error(f"Failed to delete webhook: {e}", exc_info=True) 241 | 242 | if ptb_app.running: 243 | await ptb_app.stop() 244 | await ptb_app.shutdown() 245 | logger.info("PTB Application components stopped and shut down.") 246 | except Exception as e: 247 | logger.error(f"Error during PTB application shutdown: {e}", exc_info=True) 248 | 249 | 250 | # --- FastAPI Application Definition --- 251 | app = FastAPI(lifespan=lifespan) 252 | 253 | 254 | # --- Webhook Endpoint --- 255 | @app.post(f"/{WEBHOOK_SECRET_PATH}") 256 | async def webhook( 257 | request: Request, 258 | secret_token: str | None = Header(None, alias="X-Telegram-Bot-Api-Secret-Token"), 259 | ) -> Response: 260 | """Handles incoming Telegram updates via webhook.""" 261 | logger.info("Webhook endpoint called") 262 | 263 | # --- Webhook Secret Token Verification --- 264 | TELEGRAM_WEBHOOK_SECRET_TOKEN = os.getenv("TELEGRAM_WEBHOOK_SECRET_TOKEN") 265 | 
if TELEGRAM_WEBHOOK_SECRET_TOKEN and secret_token != TELEGRAM_WEBHOOK_SECRET_TOKEN: 266 | logger.warning( 267 | f"Invalid secret token received: '{secret_token}' vs expected token" 268 | ) 269 | raise HTTPException(status_code=403, detail="Invalid secret token") 270 | 271 | # Ensure the bot is initialized before processing updates 272 | if not hasattr(app.state, "bot_initialized") or not app.state.bot_initialized: 273 | logger.error("Bot not yet initialized. Request rejected.") 274 | raise HTTPException(status_code=503, detail="Bot initialization in progress") 275 | 276 | try: 277 | # Get the raw request body for logging if needed 278 | body = await request.body() 279 | logger.info(f"Received webhook request body length: {len(body)} bytes") 280 | 281 | # Parse the request JSON 282 | update_data = await request.json() 283 | logger.info(f"Successfully parsed update JSON") 284 | 285 | # Convert to Telegram Update object 286 | update = Update.de_json(update_data, ptb_app.bot) 287 | logger.info( 288 | f"Received update: {update.update_id}, type: {type(update).__name__}" 289 | ) 290 | 291 | # Extract some basic info for logging 292 | message = update.message or update.edited_message 293 | if message: 294 | logger.info( 295 | f"Message content: '{message.text if message.text else '[no text]'}'" 296 | ) 297 | 298 | # Process the update 299 | # logger.info("Processing update with PTB application...") 300 | # await ptb_app.process_update(update) 301 | # logger.info(f"Successfully processed update {update.update_id}") 302 | 303 | # Kick off processing in the background and ACK Telegram immediately 304 | logger.info("Scheduling background processing...") 305 | asyncio.create_task(ptb_app.process_update(update)) 306 | return {"ok": True} # must be <10 s 307 | 308 | except json.JSONDecodeError as e: 309 | logger.error(f"Failed to parse webhook request JSON: {e}", exc_info=True) 310 | return {"ok": False, "error": "Invalid JSON"} 311 | except Exception as e: 312 | 
logger.error(f"Error processing update: {e}", exc_info=True) 313 | # Return 200 even on error to prevent Telegram from retrying too aggressively 314 | return {"ok": False, "error": str(e)} 315 | 316 | 317 | # --- Health Check Endpoint (Good Practice) --- 318 | @app.get("/health") 319 | async def health_check(): 320 | """Basic health check endpoint.""" 321 | logger.info("Health check endpoint called.") 322 | return {"status": "ok"} 323 | 324 | 325 | # --- Main Execution Block (for running with uvicorn) --- 326 | if __name__ == "__main__": 327 | host = os.getenv("HOST", "0.0.0.0") 328 | port = int(os.getenv("PORT", "8080")) 329 | 330 | # Check if we should run in polling mode (for local testing without webhook) 331 | use_polling = os.getenv("USE_POLLING", "false").lower() == "true" 332 | 333 | if use_polling: 334 | logger.info("Starting bot in polling mode (from __main__)...") 335 | # This block is mainly for running `python bot.py` directly. 336 | # When running with Uvicorn, the lifespan handler above manages polling. 337 | 338 | # Set a flag that lifespan can check if needed, though direct call is better. 339 | os.environ["_SUPERVISOR_USE_POLLING_MODE"] = "1" 340 | 341 | async def main_polling_directly(): 342 | global ptb_app 343 | logger.info( 344 | "Initializing PTB application for direct polling (main_polling_directly)..." 
345 | ) 346 | await ptb_app.initialize() 347 | url_handler = MessageHandler( 348 | filters.TEXT & (~filters.COMMAND), handle_message 349 | ) 350 | ptb_app.add_handler(url_handler) 351 | await ptb_app.start() 352 | logger.info("Starting PTB polling loop (main_polling_directly)...") 353 | try: 354 | await ptb_app.updater.start_polling(poll_interval=1.0) 355 | while True: # Keep alive 356 | await asyncio.sleep(3600) 357 | except KeyboardInterrupt: 358 | logger.info("Polling stopped by user (main_polling_directly).") 359 | finally: 360 | logger.info("Shutting down PTB from main_polling_directly...") 361 | if ptb_app.updater.running: 362 | ptb_app.updater.stop() # stop() is not awaitable here 363 | if ptb_app.running: 364 | await ptb_app.stop() 365 | await ptb_app.shutdown() 366 | logger.info("PTB application shut down after direct polling.") 367 | 368 | asyncio.run(main_polling_directly()) 369 | if "_SUPERVISOR_USE_POLLING_MODE" in os.environ: 370 | del os.environ["_SUPERVISOR_USE_POLLING_MODE"] 371 | else: 372 | # Run in webhook mode with FastAPI/Uvicorn 373 | logger.info(f"Starting Uvicorn server on {host}:{port} for webhook mode...") 374 | uvicorn.run(app, host=host, port=port) 375 | -------------------------------------------------------------------------------- /images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/images/image.png -------------------------------------------------------------------------------- /images/system_arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/images/system_arch.jpg -------------------------------------------------------------------------------- /langgraph.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "dependencies": ["."], 3 | "graphs": { 4 | "agent": "./agent.py:graph" 5 | }, 6 | "env": ".env" 7 | } 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "telegram-link-summarizer-agent" 3 | version = "0.1.0" 4 | description = "An agentic Telegram bot to summarize links and papers." 5 | readme = "README.md" 6 | requires-python = ">=3.11" # Adjusted to match Docker image 7 | dependencies = [ 8 | "baml-py>=0.88.0", 9 | "langgraph>=0.0.57", 10 | "langchain>=0.2.0", 11 | "langchain-openai>=0.1.7", # Assuming OpenAI, change if needed 12 | "python-telegram-bot[ext]>=21.0", 13 | "requests>=2.31.0", 14 | "pypdf>=4.2.0", 15 | "python-dotenv>=1.0.1", 16 | "tavily-python>=0.3.3", # Adding Tavily as it's in config 17 | # Add any other specific langchain community/experimental packages if used 18 | "langgraph-checkpoint-sqlite>=2.0.6", 19 | "langgraph-cli[inmem]>=0.2.7", 20 | "marimo>=0.13.2", 21 | "langchain-community>=0.3.22", 22 | "rich>=14.0.0", 23 | "loguru>=0.7.3", 24 | "fastapi>=0.115.12", 25 | "uvicorn>=0.34.2", 26 | "pymupdf>=1.25.5", 27 | "click>=8.1.8", 28 | "h11>=0.16.0", 29 | "starlette>=0.46.2", 30 | "pydantic>=2.11.3", 31 | "pydantic-core>=2.33.1", 32 | "typing-extensions>=4.13.2", 33 | "typing-inspection>=0.4.0", 34 | "annotated-types>=0.7.0", 35 | "anyio>=4.9.0", 36 | "nest-asyncio>=1.6.0", 37 | "yt-dlp>=2025.4.30", 38 | "youtube-transcript-api>=1.0.3", 39 | "google-api-python-client>=2.169.0", 40 | "google-cloud-secret-manager>=2.20.0", # Added for GCP Secret Manager access 41 | "bs4>=0.0.2", 42 | "playwright>=1.52.0", 43 | "agentql>=1.10.0", 44 | "beautifulsoup4>=4.13.4", 45 | ] 46 | 47 | [tool.setuptools] 48 | py-modules = ["agent", "bot", "config"] 49 | packages = ["baml_client", "tools"] 50 | 
-------------------------------------------------------------------------------- /scripts/deploy_cloud_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Deploy the Telegram Summarizer Bot to Google Cloud Run --- 4 | 5 | set -e # Exit immediately if a command exits with a non-zero status. 6 | 7 | # --- Configuration --- 8 | # You can set these environment variables or the script will prompt you. 9 | # PROJECT_ID="your-gcp-project-id" 10 | # REGION="your-preferred-region" # e.g., us-central1 11 | # SERVICE_NAME="telegram-summarizer" 12 | # REPO_NAME="my-summarizer-bot-repo" # Artifact Registry repo name 13 | 14 | # Define the secrets to map from Secret Manager to Cloud Run environment variables. 15 | # Format: "ENV_VAR_NAME_IN_CLOUDRUN=SECRET_NAME_IN_MANAGER:latest" 16 | SECRETS_TO_MAP=( 17 | "GEMINI_API_KEY=GEMINI_API_KEY:latest" 18 | "DEEPSEEK_API_KEY=DEEPSEEK_API_KEY:latest" 19 | "TAVILY_API_KEY=TAVILY_API_KEY:latest" 20 | "TWITTER_API_IO_KEY=TWITTER_API_IO_KEY:latest" 21 | "AGENTQL_API_KEY=AGENTQL_API_KEY:latest" 22 | "TELEGRAM_BOT_TOKEN=TELEGRAM_BOT_TOKEN:latest" 23 | "TELEGRAM_WEBHOOK_SECRET_TOKEN=TELEGRAM_WEBHOOK_SECRET_TOKEN:latest" 24 | "WEBHOOK_SECRET_PATH=WEBHOOK_SECRET_PATH:latest" 25 | ) 26 | 27 | # --- Script Logic --- 28 | 29 | # Check dependencies 30 | if ! command -v gcloud &> /dev/null; then echo "Error: gcloud not found. Please install Google Cloud SDK." >&2; exit 1; fi 31 | if ! command -v docker &> /dev/null; then echo "Error: docker not found. Please install Docker." >&2; exit 1; fi 32 | 33 | # Get configuration if not set via environment variables 34 | PROJECT_ID=${PROJECT_ID:-"$(gcloud config get-value project)"} 35 | if [ -z "${PROJECT_ID}" ]; then read -p "Enter Google Cloud Project ID: " PROJECT_ID; fi 36 | if [ -z "${PROJECT_ID}" ]; then echo "Error: Project ID is required." 
>&2; exit 1; fi 37 | gcloud config set project "$PROJECT_ID" 38 | 39 | REGION=${REGION:-"$(gcloud config get-value run/region)"} 40 | if [ -z "${REGION}" ]; then read -p "Enter Google Cloud Region (e.g., us-central1): " REGION; fi 41 | if [ -z "${REGION}" ]; then echo "Error: Region is required." >&2; exit 1; fi 42 | gcloud config set run/region "$REGION" 43 | 44 | SERVICE_NAME=${SERVICE_NAME:-"telegram-summarizer"} 45 | read -p "Enter Cloud Run Service Name [${SERVICE_NAME}]: " INPUT_SERVICE_NAME 46 | SERVICE_NAME=${INPUT_SERVICE_NAME:-$SERVICE_NAME} 47 | 48 | REPO_NAME=${REPO_NAME:-"summarizer-bot-repo"} 49 | read -p "Enter Artifact Registry Repository Name [${REPO_NAME}]: " INPUT_REPO_NAME 50 | REPO_NAME=${INPUT_REPO_NAME:-$REPO_NAME} 51 | 52 | # Construct image name 53 | IMAGE_NAME="${REGION}-docker.pkg.dev/${PROJECT_ID}/${REPO_NAME}/${SERVICE_NAME}:latest" 54 | 55 | echo "--- Deployment Configuration ---" 56 | echo "Project ID: $PROJECT_ID" 57 | echo "Region: $REGION" 58 | echo "Service Name: $SERVICE_NAME" 59 | echo "Artifact Repo: $REPO_NAME" 60 | echo "Image Name: $IMAGE_NAME" 61 | echo "------------------------------" 62 | read -p "Proceed with deployment? (y/N): " CONFIRM 63 | if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then 64 | echo "Deployment cancelled." 65 | exit 0 66 | fi 67 | 68 | # Get the directory of the script itself 69 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 70 | PROJECT_ROOT="$SCRIPT_DIR/.." 71 | 72 | # Enable APIs 73 | echo "Enabling required Google Cloud APIs..." 74 | gcloud services enable run.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com secretmanager.googleapis.com --project="$PROJECT_ID" 75 | 76 | # Create Artifact Registry Repository 77 | echo "Checking/Creating Artifact Registry repository '$REPO_NAME' in region '$REGION'..." 78 | if ! 
gcloud artifacts repositories describe "$REPO_NAME" --location="$REGION" --project="$PROJECT_ID" &> /dev/null; then 79 | gcloud artifacts repositories create "$REPO_NAME" \ 80 | --repository-format=docker \ 81 | --location="$REGION" \ 82 | --description="Docker repository for $SERVICE_NAME" \ 83 | --project="$PROJECT_ID" 84 | echo "Created Artifact Registry repository." 85 | else 86 | echo "Artifact Registry repository already exists." 87 | fi 88 | 89 | # Configure Docker Authentication 90 | echo "Configuring Docker authentication for $REGION..." 91 | gcloud auth configure-docker "${REGION}-docker.pkg.dev" --project="$PROJECT_ID" 92 | 93 | # Build the Docker image 94 | echo "Building Docker image '$IMAGE_NAME' from $PROJECT_ROOT..." 95 | cd "$PROJECT_ROOT" || exit 1 96 | docker build -t "$IMAGE_NAME" . 97 | if [ $? -ne 0 ]; then echo "Error: Docker build failed." >&2; exit 1; fi 98 | 99 | # Push the Docker image 100 | echo "Pushing Docker image to Artifact Registry..." 101 | docker push "$IMAGE_NAME" 102 | if [ $? -ne 0 ]; then echo "Error: Docker push failed." >&2; exit 1; fi 103 | 104 | # Construct secrets argument 105 | if [ ${#SECRETS_TO_MAP[@]} -eq 0 ]; then 106 | # This case should not happen with the hardcoded list above 107 | echo "Internal Error: SECRETS_TO_MAP array is empty in script ($0)." >&2 108 | exit 1 109 | fi 110 | SECRETS_ARG=$(printf -- "--set-secrets=%s" "$(IFS=,; echo "${SECRETS_TO_MAP[*]}")") 111 | echo "Will map the following secrets to environment variables in Cloud Run:" 112 | printf " %s\n" "${SECRETS_TO_MAP[@]}" 113 | 114 | # Deploy to Cloud Run 115 | echo "Deploying service '$SERVICE_NAME' to Cloud Run in region '$REGION'..." 116 | 117 | # You can add additional flags here as needed. 
Example: 118 | # --memory=512Mi # Specify memory 119 | # --cpu=1 # Specify CPU 120 | # --min-instances=0 # Allows scaling to zero (default, so not explicitly required) 121 | # --max-instances=10 # Maximum number of instances 122 | 123 | gcloud run deploy "$SERVICE_NAME" \ 124 | --image="$IMAGE_NAME" \ 125 | --platform=managed \ 126 | --region="$REGION" \ 127 | --port=8080 \ 128 | --allow-unauthenticated \ 129 | --memory=1024Mi \ 130 | --min-instances=1 \ 131 | --cpu-throttling \ 132 | $SECRETS_ARG \ 133 | --project="$PROJECT_ID" 134 | 135 | if [ $? -ne 0 ]; then echo "Error: Cloud Run deployment failed." >&2; exit 1; fi 136 | 137 | # Get the service URL 138 | SERVICE_URL=$(gcloud run services describe "$SERVICE_NAME" --platform managed --region "$REGION" --format 'value(status.url)' --project="$PROJECT_ID") 139 | echo "Service deployed successfully. URL: $SERVICE_URL" 140 | 141 | # --- Set Telegram Webhook --- 142 | echo "Attempting to set Telegram webhook..." 143 | 144 | # Find the secret IDs for the bot token and webhook path from the mapping 145 | TELEGRAM_BOT_TOKEN_SECRET_ID="" 146 | WEBHOOK_SECRET_PATH_SECRET_ID="" 147 | WEBHOOK_SECRET_TOKEN_SECRET_ID="" 148 | 149 | for mapping in "${SECRETS_TO_MAP[@]}"; do 150 | env_var_name=$(echo "$mapping" | cut -d'=' -f1) 151 | secret_ref=$(echo "$mapping" | cut -d'=' -f2) 152 | secret_id=$(echo "$secret_ref" | cut -d':' -f1) 153 | 154 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "telegram_bot_token" ]]; then 155 | TELEGRAM_BOT_TOKEN_SECRET_ID="$secret_id" 156 | fi 157 | # Use the env var name expected by bot.py (which matches the secret name here) 158 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "webhook_secret_path" ]]; then 159 | WEBHOOK_SECRET_PATH_SECRET_ID="$secret_id" 160 | fi 161 | if [[ "$(echo "$env_var_name" | tr '[:upper:]' '[:lower:]')" == "telegram_webhook_secret_token" ]]; then 162 | WEBHOOK_SECRET_TOKEN_SECRET_ID="$secret_id" 163 | fi 164 | done 165 | 166 | if [ -z 
"$TELEGRAM_BOT_TOKEN_SECRET_ID" ]; then 167 | echo "Error: Could not find TELEGRAM_BOT_TOKEN mapping in SECRETS_TO_MAP. Cannot set webhook." >&2 168 | exit 1 169 | fi 170 | 171 | if [ -z "$WEBHOOK_SECRET_PATH_SECRET_ID" ]; then 172 | echo "Warning: Could not find WEBHOOK_SECRET_PATH mapping in SECRETS_TO_MAP." >&2 173 | echo "Will attempt to set webhook using default '/webhook' path." >&2 174 | # Default path if not found in secrets 175 | WEBHOOK_PATH_VALUE="/webhook" 176 | else 177 | echo "Fetching Webhook Secret Path from Secret Manager..." 178 | WEBHOOK_PATH_VALUE=$(gcloud secrets versions access latest --secret="$WEBHOOK_SECRET_PATH_SECRET_ID" --project="$PROJECT_ID") 179 | # Ensure path starts with a slash 180 | if [[ "$WEBHOOK_PATH_VALUE" != /* ]]; then 181 | WEBHOOK_PATH_VALUE="/$WEBHOOK_PATH_VALUE" 182 | fi 183 | fi 184 | 185 | # Fetch the latest version of the secrets 186 | echo "Fetching Telegram Bot Token from Secret Manager..." 187 | TELEGRAM_BOT_TOKEN=$(gcloud secrets versions access latest --secret="$TELEGRAM_BOT_TOKEN_SECRET_ID" --project="$PROJECT_ID") 188 | 189 | WEBHOOK_SECRET_TOKEN="" 190 | if [ -n "$WEBHOOK_SECRET_TOKEN_SECRET_ID" ]; then 191 | echo "Fetching Webhook Secret Token from Secret Manager..." 192 | WEBHOOK_SECRET_TOKEN=$(gcloud secrets versions access latest --secret="$WEBHOOK_SECRET_TOKEN_SECRET_ID" --project="$PROJECT_ID") 193 | fi 194 | 195 | # Construct webhook URL 196 | FINAL_WEBHOOK_URL="${SERVICE_URL}${WEBHOOK_PATH_VALUE}" 197 | 198 | echo "Setting webhook to: $FINAL_WEBHOOK_URL" 199 | 200 | # Use curl to set the webhook 201 | API_URL="https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" 202 | 203 | echo "DEBUG: Preparing curl command..." 204 | 205 | # Construct the complete JSON payload in one string 206 | if [ -n "$WEBHOOK_SECRET_TOKEN" ]; then 207 | echo "Using webhook secret token." 
208 | JSON_PAYLOAD="{\"url\": \"$FINAL_WEBHOOK_URL\", \"secret_token\": \"$WEBHOOK_SECRET_TOKEN\"}" 209 | else 210 | echo "No webhook secret token found/used." 211 | JSON_PAYLOAD="{\"url\": \"$FINAL_WEBHOOK_URL\"}" 212 | fi 213 | 214 | echo "DEBUG: JSON Payload: $JSON_PAYLOAD" 215 | 216 | # Function to make the curl request with retries 217 | set_webhook_with_retry() { 218 | local max_retries=3 219 | local retry_count=0 220 | local wait_time=2 221 | 222 | while [ $retry_count -lt $max_retries ]; do 223 | echo "DEBUG: Setting webhook (attempt $((retry_count + 1))/$max_retries)..." 224 | 225 | # Make the request 226 | RESPONSE=$(curl -s -X POST "$API_URL" \ 227 | -H "Content-Type: application/json" \ 228 | -d "$JSON_PAYLOAD") 229 | 230 | CURL_EXIT_CODE=$? 231 | 232 | # Check for rate limit error 233 | if [ $CURL_EXIT_CODE -eq 0 ] && echo "$RESPONSE" | grep -q '"error_code":429'; then 234 | retry_after=$(echo "$RESPONSE" | grep -o '"retry_after":[0-9]*' | grep -o '[0-9]*') 235 | 236 | # If retry_after is not found or not a number, use default wait time 237 | if [ -z "$retry_after" ] || ! [[ "$retry_after" =~ ^[0-9]+$ ]]; then 238 | retry_after=$wait_time 239 | fi 240 | 241 | echo "Rate limited by Telegram API. Waiting ${retry_after}s before retry..." 242 | sleep $((retry_after + 1)) # Wait a bit longer than recommended 243 | retry_count=$((retry_count + 1)) 244 | continue 245 | fi 246 | 247 | # If we get here, either there was no rate limit error or another error occurred 248 | break 249 | done 250 | 251 | return $CURL_EXIT_CODE 252 | } 253 | 254 | # Call the function to make the request with retries 255 | set_webhook_with_retry 256 | CURL_EXIT_CODE=$? 
257 | 258 | # For debugging, let's see the response 259 | echo "DEBUG: Webhook response: $RESPONSE" 260 | 261 | # Check response from Telegram API 262 | if [ $CURL_EXIT_CODE -ne 0 ]; then 263 | echo "Error: curl command failed with exit code $CURL_EXIT_CODE" >&2 264 | exit 1 265 | elif echo "$RESPONSE" | grep -q '"ok":true'; then 266 | echo "Telegram webhook set successfully!" 267 | echo "Result: $RESPONSE" 268 | elif echo "$RESPONSE" | grep -q '"description":"Webhook is already set"'; then 269 | # This is also a success case, webhook is properly set 270 | echo "Telegram webhook was already set to this URL." 271 | echo "Result: $RESPONSE" 272 | else 273 | echo "Error setting Telegram webhook." >&2 274 | echo "URL used: $FINAL_WEBHOOK_URL" >&2 275 | echo "Check TELEGRAM_BOT_TOKEN, WEBHOOK_SECRET_PATH (if used), and ensure the service URL is correct and publicly accessible." >&2 276 | echo "Telegram API Response: $RESPONSE" >&2 277 | exit 1 278 | fi 279 | 280 | echo "--- Deployment Complete ---" 281 | -------------------------------------------------------------------------------- /scripts/deploy_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Deploy Telegram Summarizer Bot to Self-Managed Server --- 4 | 5 | set -e # Exit immediately if a command exits with a non-zero status. 
# --- Configuration ---
# All values are overridable from the environment; defaults are for the
# project's own server.
# NOTE(review): SERVER_IP default is a hard-coded public IP — confirm it is
# still the intended target before reuse.
SERVER_IP=${SERVER_IP:-"38.54.75.29"}
CONTAINER_NAME=${CONTAINER_NAME:-"telegram-summarizer"}
IMAGE_NAME=${IMAGE_NAME:-"telegram-summarizer:latest"}
HOST_PORT=${HOST_PORT:-"8080"}
CONTAINER_PORT=${CONTAINER_PORT:-"8080"}

echo "--- Server Deployment Configuration ---"
echo "Server IP: $SERVER_IP"
echo "Container Name: $CONTAINER_NAME"
echo "Image Name: $IMAGE_NAME"
echo "Host Port: $HOST_PORT"
echo "Container Port: $CONTAINER_PORT"
echo "----------------------------------------"

# Get the directory of the script itself
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
PROJECT_ROOT="$SCRIPT_DIR/.."

# Check if .env file exists (required for --env-file below)
if [ ! -f "$PROJECT_ROOT/.env" ]; then
    echo "Error: .env file not found in project root. Please create it with your environment variables." >&2
    exit 1
fi

echo "Found .env file. Proceeding with deployment..."

# Stop and remove existing container if it exists.
# `|| true` keeps these from tripping the script's `set -e` when the
# container does not exist yet.
echo "Stopping and removing existing container (if any)..."
docker stop "$CONTAINER_NAME" 2>/dev/null || true
docker rm "$CONTAINER_NAME" 2>/dev/null || true

# Remove old image to ensure we build fresh
echo "Removing old image (if any)..."
docker rmi "$IMAGE_NAME" 2>/dev/null || true

# Build the Docker image
echo "Building Docker image '$IMAGE_NAME'..."
cd "$PROJECT_ROOT" || exit 1
docker build -t "$IMAGE_NAME" .
# NOTE(review): the script runs under `set -e` (declared at its top), so a
# failing build exits before this `$?` check is ever reached — the check is
# effectively dead code; confirm whether `set -e` or the explicit checks are
# the intended error strategy.
if [ $? -ne 0 ]; then
    echo "Error: Docker build failed." >&2
    exit 1
fi

# Run the container detached, auto-restarting, with the .env file injected
# as container environment variables.
echo "Starting container '$CONTAINER_NAME'..."
docker run -d \
    --name "$CONTAINER_NAME" \
    --restart unless-stopped \
    -p "$HOST_PORT:$CONTAINER_PORT" \
    --env-file .env \
    "$IMAGE_NAME"

# NOTE(review): dead under `set -e` — see build check above.
if [ $? -ne 0 ]; then
    echo "Error: Failed to start container." >&2
    exit 1
fi

echo "Container started successfully!"

# Wait a moment for the container to start
sleep 5

# Check container status; on failure dump the logs before bailing out.
echo "Checking container status..."
docker ps | grep "$CONTAINER_NAME" || {
    echo "Error: Container is not running. Checking logs..."
    docker logs "$CONTAINER_NAME"
    exit 1
}

# Check health endpoint (non-fatal: a failed probe only prints recent logs)
echo "Checking health endpoint..."
sleep 10 # Give the app time to start
if curl -f "http://localhost:$HOST_PORT/health" >/dev/null 2>&1; then
    echo "✅ Health check passed!"
else
    echo "⚠️ Health check failed. Checking logs..."
    docker logs --tail 20 "$CONTAINER_NAME"
fi

# Set Telegram webhook
echo "Setting Telegram webhook..."
if [ -f "$PROJECT_ROOT/.env" ]; then
    # Source the .env file to get variables into this shell.
    set -a # automatically export all variables
    source "$PROJECT_ROOT/.env"
    set +a # stop automatically exporting

    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$WEBHOOK_URL" ] && [ -n "$WEBHOOK_SECRET_PATH" ]; then
        FULL_WEBHOOK_URL="${WEBHOOK_URL}${WEBHOOK_SECRET_PATH}"
        echo "Setting webhook to: $FULL_WEBHOOK_URL"

        # Prepare JSON payload; secret_token is optional and lets the bot
        # verify that incoming updates really come from Telegram.
        if [ -n "$TELEGRAM_WEBHOOK_SECRET_TOKEN" ]; then
            JSON_PAYLOAD="{\"url\": \"$FULL_WEBHOOK_URL\", \"secret_token\": \"$TELEGRAM_WEBHOOK_SECRET_TOKEN\"}"
        else
            JSON_PAYLOAD="{\"url\": \"$FULL_WEBHOOK_URL\"}"
        fi

        # Set webhook via the Telegram Bot API
        RESPONSE=$(curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
            -H "Content-Type: application/json" \
            -d "$JSON_PAYLOAD")

        if echo "$RESPONSE" | grep -q '"ok":true'; then
            echo "✅ Telegram webhook set successfully!"
        else
            echo "⚠️ Failed to set Telegram webhook. Response: $RESPONSE"
        fi
    else
        echo "⚠️ Missing webhook configuration in .env file. Please set webhook manually."
120 | fi 121 | fi 122 | 123 | echo "" 124 | echo "--- Deployment Complete ---" 125 | echo "Container: $CONTAINER_NAME" 126 | echo "Status: $(docker inspect -f '{{.State.Status}}' $CONTAINER_NAME)" 127 | echo "Logs: docker logs $CONTAINER_NAME" 128 | echo "Stop: docker stop $CONTAINER_NAME" 129 | echo "Restart: docker restart $CONTAINER_NAME" 130 | echo "" 131 | echo "Your bot should now be accessible at: http://$SERVER_IP:$HOST_PORT" 132 | echo "Health check: http://$SERVER_IP:$HOST_PORT/health" -------------------------------------------------------------------------------- /scripts/run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- Build and Run the Telegram Summarizer Bot using Docker --- 4 | 5 | IMAGE_NAME="telegram-summarizer" 6 | CONTAINER_NAME="summarizer-bot" 7 | 8 | # Get the directory of the script itself 9 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 10 | # Go one level up to the project root 11 | PROJECT_ROOT="$SCRIPT_DIR/.." 12 | 13 | ENV_FILE="$PROJECT_ROOT/.env" 14 | 15 | if [ ! -f "$ENV_FILE" ]; then 16 | echo "Error: .env file not found at $ENV_FILE" >&2 17 | echo "Please create and configure the .env file before running." >&2 18 | exit 1 19 | fi 20 | 21 | echo "Building the Docker image ($IMAGE_NAME)..." 22 | cd "$PROJECT_ROOT" || exit 1 23 | docker build -t "$IMAGE_NAME" . 24 | 25 | if [ $? -ne 0 ]; then 26 | echo "Error: Docker build failed." >&2 27 | exit 1 28 | fi 29 | 30 | echo "Stopping and removing existing container named '$CONTAINER_NAME' (if any)..." 31 | docker stop "$CONTAINER_NAME" > /dev/null 2>&1 32 | docker rm "$CONTAINER_NAME" > /dev/null 2>&1 33 | 34 | echo "Running the Docker container ($CONTAINER_NAME) with .env file..." 35 | echo "Access the health check at http://localhost:8080/health" 36 | 37 | docker run -p 8080:8080 --rm --name "$CONTAINER_NAME" --env-file "$ENV_FILE" "$IMAGE_NAME" 38 | 39 | if [ $? 
-ne 0 ]; then
    echo "Error: Failed to run Docker container." >&2
    exit 1
fi
-------------------------------------------------------------------------------- /scripts/run_local.sh: --------------------------------------------------------------------------------
#!/bin/bash

# --- Run the Telegram Summarizer Bot Locally (without Docker) ---

echo "Starting the Telegram Summarizer Bot using uvicorn..."
echo "Ensure you have installed dependencies using 'uv sync'"
echo "Ensure your .env file is configured in the project root."

# Get the directory of the script itself
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Go one level up to the project root
PROJECT_ROOT="$SCRIPT_DIR/.."

# Run uvicorn from the project root (--reload for local development)
cd "$PROJECT_ROOT" || exit 1
uvicorn bot:app --host 0.0.0.0 --port 8080 --reload
-------------------------------------------------------------------------------- /scripts/setup_secrets.sh: --------------------------------------------------------------------------------
#!/bin/bash

# --- Setup Secrets in Google Cloud Secret Manager ---
# Interactively creates (or versions) each application secret and grants the
# default Compute service account read access so Cloud Run can mount them.

# Define the specific secrets used by this application
SECRETS=(
    "GEMINI_API_KEY"
    "DEEPSEEK_API_KEY"
    "TAVILY_API_KEY"
    "TWITTER_API_IO_KEY"
    "AGENTQL_API_KEY"
    "TELEGRAM_BOT_TOKEN"
    "TELEGRAM_WEBHOOK_SECRET_TOKEN"
    "WEBHOOK_SECRET_PATH"
)

# --- You shouldn't need to edit below this line ---

# Check if gcloud is installed
if ! command -v gcloud &> /dev/null; then
    echo "Error: gcloud command not found. Please install the Google Cloud SDK." >&2
    exit 1
fi

# Get PROJECT_ID if not set (prompts interactively when absent from env)
if [ -z "${PROJECT_ID}" ]; then
    read -p "Enter your Google Cloud Project ID: " PROJECT_ID
    if [ -z "${PROJECT_ID}" ]; then
        echo "Error: Project ID cannot be empty." >&2
        exit 1
    fi
    export PROJECT_ID
    gcloud config set project "$PROJECT_ID"
fi

echo "Using Project ID: $PROJECT_ID"

echo "Enabling Secret Manager API (if not already enabled)..."
gcloud services enable secretmanager.googleapis.com --project="$PROJECT_ID"

# --- Grant Secret Accessor Role to Default Compute Service Account ---
echo "Fetching Project Number for $PROJECT_ID..."
PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" --format='value(projectNumber)')

if [ -z "$PROJECT_NUMBER" ]; then
    echo "Error: Could not fetch Project Number for Project ID $PROJECT_ID." >&2
    echo "Please ensure the Project ID is correct and you have permissions." >&2
    exit 1
fi

# Cloud Run uses the default Compute Engine service account unless overridden.
SERVICE_ACCOUNT_EMAIL="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
ROLE_TO_GRANT="roles/secretmanager.secretAccessor"

echo "Checking if service account $SERVICE_ACCOUNT_EMAIL has role $ROLE_TO_GRANT..."
# Check current policy binding (suppress errors if role isn't found)
if ! gcloud projects get-iam-policy "$PROJECT_ID" \
    --flatten="bindings[].members" \
    --format='table(bindings.role)' \
    --filter="bindings.members:$SERVICE_ACCOUNT_EMAIL AND bindings.role:$ROLE_TO_GRANT" 2>/dev/null | grep -q "$ROLE_TO_GRANT"; then

    echo "Granting '$ROLE_TO_GRANT' to service account '$SERVICE_ACCOUNT_EMAIL' on project '$PROJECT_ID'..."
    gcloud projects add-iam-policy-binding "$PROJECT_ID" \
        --member="serviceAccount:$SERVICE_ACCOUNT_EMAIL" \
        --role="$ROLE_TO_GRANT" \
        --condition=None # Explicitly setting no condition

    # NOTE(review): a grant failure only warns — the script deliberately
    # continues so secrets can still be created; confirm that is intended.
    if [ $? -ne 0 ]; then
        echo "Error: Failed to grant IAM role $ROLE_TO_GRANT to $SERVICE_ACCOUNT_EMAIL." >&2
        echo "Please check permissions or grant the role manually via the Google Cloud Console." >&2
        # Decide if you want to exit or continue
        # exit 1
    else
        echo "IAM role granted successfully."
    fi
else
    echo "Service account already has the required role."
fi
# --- End Grant Role ---


if [ ${#SECRETS[@]} -eq 0 ]; then
    echo "Internal Error: SECRETS array is empty in script ($0)." >&2
    exit 1
fi


# For each secret: add a new version when it already exists (after a
# confirmation prompt), otherwise create it and seed the first version.
for SECRET_NAME in "${SECRETS[@]}"; do
    echo "-------------------------------------"
    echo "Processing Secret: $SECRET_NAME"

    # Check if secret exists
    if gcloud secrets describe "$SECRET_NAME" --project="$PROJECT_ID" &> /dev/null; then
        echo "Secret '$SECRET_NAME' already exists."
        read -p "Do you want to add a new version with a new value? (y/N): " ADD_VERSION_CONFIRM
        if [[ "$ADD_VERSION_CONFIRM" =~ ^[Yy]$ ]]; then
            # Add new version
            # Prompt for the secret value without echoing to the terminal
            echo -n "Enter the new value for secret '$SECRET_NAME': "
            read -s SECRET_VALUE
            echo # Add a newline after reading the secret
            if [ -z "$SECRET_VALUE" ]; then
                echo "Warning: Secret value is empty. Skipping adding new version for '$SECRET_NAME'." >&2
            else
                # printf (not echo) so the value is written without a trailing newline
                printf "%s" "$SECRET_VALUE" | gcloud secrets versions add "$SECRET_NAME" --data-file=- --project="$PROJECT_ID"
                echo "Added new version to secret '$SECRET_NAME'."
            fi
        else
            echo "Skipping secret '$SECRET_NAME'."
        fi
    else
        # Create secret
        echo "Secret '$SECRET_NAME' does not exist. Creating it..."
        gcloud secrets create "$SECRET_NAME" --replication-policy="automatic" --project="$PROJECT_ID"
        if [ $? -ne 0 ]; then
            echo "Error: Failed to create secret '$SECRET_NAME'." >&2
            continue # Skip to the next secret
        fi
        echo "Created secret '$SECRET_NAME'."

        # Add the first version
        # Prompt for the secret value without echoing to the terminal
        echo -n "Enter the value for secret '$SECRET_NAME': "
        read -s SECRET_VALUE
        echo
        if [ -z "$SECRET_VALUE" ]; then
            echo "Warning: Secret value is empty. Creating secret '$SECRET_NAME' with no initial version." >&2
        else
            printf "%s" "$SECRET_VALUE" | gcloud secrets versions add "$SECRET_NAME" --data-file=- --project="$PROJECT_ID"
            echo "Added initial version to secret '$SECRET_NAME'."
        fi
    fi
done

echo "-------------------------------------"
echo "Secret setup process complete."
echo "Remember to grant your Cloud Run service account (PROJECT_NUMBER-compute@developer.gserviceaccount.com) the 'Secret Manager Secret Accessor' role for these secrets."
-------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kargarisaac/telegram_link_summarizer_agent/4d60395aca42e37cca330745b80f4a242419a455/tools/__init__.py
-------------------------------------------------------------------------------- /tools/linkedin_agentql_scraper.py: --------------------------------------------------------------------------------
"""
linkedin_agentql_scraper.py

Scrape a public LinkedIn post's author name and full body text using Playwright + AgentQL.

⚠️ Limitations
* Works only for *public* posts (i.e. visible to signed-out visitors).
* For private / connection-only posts, you must authenticate first.

Prerequisites:
    pip install playwright agentql
    playwright install
    export AGENTQL_API_KEY=

Usage:
    python linkedin_agentql_scraper.py --url "https://www.linkedin.com/posts/..." [--headless]

The script prints a JSON-style dict with keys ``author`` and ``content``.
"""

from __future__ import annotations

import argparse
import os
import textwrap
from dotenv import load_dotenv
import agentql
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

load_dotenv()


def block_resources(route):
    """Playwright route handler: abort requests for heavy static assets.

    Skipping images/stylesheets/fonts speeds up page loads; everything else
    is allowed through unchanged.
    """
    if route.request.resource_type in ["image", "stylesheet", "font"]:
        route.abort()
    else:
        route.continue_()


def scrape_linkedin_post(url: str, headless: bool = True) -> dict[str, str]:
    """Return the post's author name and full text content.

    Args:
        url: Public LinkedIn post URL.
        headless: Launch Chromium without a visible window when True.

    Returns:
        Dict with keys ``author`` and ``content`` (either may be "" when
        AgentQL extraction finds nothing).

    Raises:
        Re-raises any navigation error from ``page.goto`` after closing the
        browser (a debug screenshot is attempted on timeout).
    """

    # 0. Configure AgentQL
    agentql.configure(api_key=os.getenv("AGENTQL_API_KEY", ""))

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=headless,
            args=["--no-sandbox"],  # needed when running inside containers
        )
        page = agentql.wrap(browser.new_page())

        # 1. Navigate & wait for DOM ready
        try:
            # Increased timeout to 60 seconds
            page.route("**/*", block_resources)
            page.goto(url, wait_until="domcontentloaded", timeout=60000)
        except PlaywrightTimeoutError as e:
            print(f"Timeout during page.goto: {e}")
            try:
                # Attempt to save a screenshot for debugging
                screenshot_path = "linkedin_timeout_screenshot.png"
                page.screenshot(path=screenshot_path)
                print(f"Screenshot saved to {screenshot_path}")
            except Exception as se:
                print(f"Failed to save screenshot: {se}")
            browser.close()  # Ensure browser is closed on error
            raise  # Re-raise the original timeout error
        except Exception as e:
            print(f"An unexpected error occurred during page.goto: {e}")
            browser.close()  # Ensure browser is closed on error
            raise  # Re-raise the original error

        page.wait_for_page_ready_state()

        # 2. Accept cookies / privacy banner if shown (EU visitors)
        # AgentQL first; plain Playwright locator as fallback; missing banner
        # is not an error.
        try:
            banner = page.query_elements(
                """
                {
                    accept_cookies_btn
                }
                """
            )
            banner.accept_cookies_btn.click(timeout=3000)
        except Exception:
            # Fallback Locator
            try:
                page.locator("button:has-text('Accept cookies')").click(timeout=3000)
            except PlaywrightTimeoutError:
                pass  # No banner

        # 3. Expand "…see more" inside the post body (if truncated)
        try:
            more = page.query_elements(
                """
                {
                    expand_post_body_btn
                }
                """
            )
            more.expand_post_body_btn.click(timeout=3000)
        except Exception:
            try:
                page.locator("button:has-text('see more')").first.click(timeout=3000)
            except PlaywrightTimeoutError:
                pass

        # 4. Extract author & content via AgentQL
        data = page.query_data(
            """
            {
                author_name
                post_body_text
            }
            """
        )

        browser.close()

        return {
            "author": data.get("author_name", ""),
            "content": textwrap.dedent(data.get("post_body_text", "")).strip(),
        }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape a LinkedIn post via Playwright + AgentQL. Works for public posts only."
130 | ) 131 | parser.add_argument("--url", required=True, help="Public LinkedIn post URL") 132 | parser.add_argument( 133 | "--headless", 134 | action="store_true", 135 | help="Run browser in headless mode (default: GUI).", 136 | ) 137 | args = parser.parse_args() 138 | 139 | result = scrape_linkedin_post(args.url, headless=args.headless) 140 | print("\n=== RESULT ===") 141 | print("Author:", result["author"]) 142 | print("\nPost text:\n", result["content"]) 143 | -------------------------------------------------------------------------------- /tools/pdf_handler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import fitz # PyMuPDF 3 | 4 | def get_pdf_text(url: str) -> str: 5 | """Downloads a PDF from a URL and extracts its text content. 6 | 7 | Args: 8 | url: The URL of the PDF file. 9 | 10 | Returns: 11 | The extracted text content of the PDF. 12 | Returns an error message string if download or processing fails. 13 | """ 14 | try: 15 | response = requests.get(url, stream=True, timeout=30) # Add timeout 16 | response.raise_for_status() # Raise an exception for bad status codes 17 | 18 | # Check content type to ensure it's a PDF before downloading fully 19 | content_type = response.headers.get('Content-Type', '').lower() 20 | if 'application/pdf' not in content_type: 21 | return f"Error: URL does not point to a PDF file (Content-Type: {content_type})" 22 | 23 | # Read the content into memory 24 | pdf_content = response.content 25 | pdf_document = fitz.open(stream=pdf_content, filetype="pdf") 26 | 27 | text = "" 28 | for page_num in range(len(pdf_document)): 29 | page = pdf_document.load_page(page_num) 30 | text += page.get_text() 31 | 32 | pdf_document.close() 33 | return text 34 | 35 | except requests.exceptions.RequestException as e: 36 | return f"Error downloading PDF: {e}" 37 | except fitz.errors.FitzError as e: # Catch PyMuPDF specific errors 38 | return f"Error processing PDF: {e}" 39 | except 
Exception as e: 40 | return f"An unexpected error occurred: {e}" 41 | 42 | # Example usage (optional, for testing): 43 | if __name__ == '__main__': 44 | # Replace with a valid PDF URL for testing 45 | test_url = "https://arxiv.org/pdf/1706.03762.pdf" # Example: Attention is All You Need paper 46 | extracted_text = get_pdf_text(test_url) 47 | if extracted_text.startswith("Error:"): 48 | print(extracted_text) 49 | else: 50 | print("Successfully extracted text:") 51 | # Print first 500 characters as a sample 52 | print(extracted_text[:500] + "...") 53 | -------------------------------------------------------------------------------- /tools/search.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rich.console import Console 3 | from tavily import TavilyClient 4 | from dotenv import load_dotenv 5 | 6 | # Load environment variables from .env file 7 | load_dotenv(override=True) 8 | 9 | console = Console() 10 | 11 | # --- Tavily Client Initialization --- 12 | TAVILY_API_KEY: str | None = os.getenv("TAVILY_API_KEY") 13 | tavily_client = None 14 | if TAVILY_API_KEY: 15 | try: 16 | # Initialize TavilyClient 17 | tavily_client = TavilyClient(api_key=TAVILY_API_KEY) 18 | console.print("Tavily client initialized in tools/search.py.", style="bold green") 19 | except Exception as e: 20 | console.print(f"Failed to initialize Tavily client: {e}", style="bold red", exc_info=True) 21 | else: 22 | console.print("TAVILY_API_KEY not found in config. Tavily tool disabled.", style="bold yellow") 23 | 24 | def run_tavily_tool(mode: str, query: str = None, urls: list[str] = None, **kwargs) -> dict | str: 25 | """ 26 | Uses Tavily client to perform search or extract content from URLs. 27 | 28 | Args: 29 | mode: The operation mode ('search' or 'extract'). 30 | query: The search query (required for 'search' mode). 31 | urls: A list of URLs to extract content from (required for 'extract' mode). 
32 | **kwargs: Additional parameters for the Tavily API (e.g., max_results, topic, time_range for search). 33 | 34 | Returns: 35 | A dictionary containing the results (search or extract), or an error string. 36 | """ 37 | if not tavily_client: 38 | console.print("Tavily client not initialized. Cannot perform operation.", style="bold red") 39 | return "Error: Tavily client is not available. Check API Key." 40 | 41 | try: 42 | if mode == 'search': 43 | if not query: 44 | return "Error: Query is required for search mode." 45 | results = tavily_client.search(query=query, **kwargs) 46 | console.print(f"Tavily search successful for query: '{query}'", style="green") 47 | 48 | elif mode == 'extract': 49 | if not urls: 50 | return "Error: URLs are required for extract mode." 51 | results = tavily_client.extract(urls=urls, **kwargs) 52 | console.print(f"Tavily extract successful for URLs: {urls}", style="green") 53 | 54 | else: 55 | return f"Error: Invalid mode '{mode}'. Use 'search' or 'extract'." 56 | 57 | if not results: 58 | console.print(f"Tavily {mode} returned no results.", style="bold yellow") 59 | return f"Error: Tavily {mode} found no information." 60 | 61 | # The SDK returns a dictionary directly 62 | return results 63 | 64 | except Exception as e: 65 | console.print(f"Tavily {mode} failed: {e}", style="bold red", exc_info=True) 66 | return f"Error: Tavily {mode} encountered an error. 
{e}" 67 | 68 | # Example for testing (optional) 69 | if __name__ == "__main__": 70 | # --- Test Search --- 71 | # test_query = "Find the recent blog post from Dario Amodei about AI Interpretability" 72 | # console.print(f"\n--- Testing Tavily Search for: '{test_query}' ---", style="bold blue") 73 | # search_results = run_tavily_tool(mode='search', query=test_query, topic="news", max_results=3) 74 | # console.print("Search results:", style="bold green") 75 | # console.print(search_results) 76 | # console.print("--- End Search Test ---", style="bold blue") 77 | 78 | # --- Test Extract --- 79 | # Example URLs (replace with valid ones if needed) 80 | test_urls_for_extract = [ 81 | "https://www.darioamodei.com/post/the-urgency-of-interpretability" 82 | ] 83 | console.print(f"\n--- Testing Tavily Extract for URLs: {test_urls_for_extract} ---", style="bold blue") 84 | extract_results = run_tavily_tool(mode='extract', urls=test_urls_for_extract) 85 | console.print("Extract results:", style="bold green") 86 | console.print(extract_results) 87 | console.print("--- End Extract Test ---", style="bold blue") 88 | -------------------------------------------------------------------------------- /tools/twitter_api_tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import requests 4 | from datetime import datetime, timezone 5 | from dotenv import load_dotenv 6 | from rich.console import Console 7 | 8 | load_dotenv() 9 | console = Console() 10 | 11 | 12 | def _parse_twitter_datetime(datetime_str: str) -> datetime: 13 | """Parses Twitter's datetime string into a timezone-aware datetime object.""" 14 | # Format: 'Thu May 01 12:03:30 +0000 2025' 15 | # Need to handle the +0000 timezone correctly 16 | try: 17 | # Standard format doesn't handle '+0000' directly as %z prior to Python 3.7/3.11 depending on platform? 18 | # Let's parse manually or use a robust library if needed. 
19 | # For simplicity, assuming UTC (+0000) 20 | dt_naive = datetime.strptime(datetime_str, "%a %b %d %H:%M:%S +0000 %Y") 21 | return dt_naive.replace(tzinfo=timezone.utc) 22 | except ValueError: 23 | console.print(f"Error parsing datetime string: {datetime_str}", style="red") 24 | # Return epoch as a fallback to allow sorting even if parsing fails 25 | return datetime.fromtimestamp(0, tz=timezone.utc) 26 | 27 | 28 | def fetch_tweet_thread(url: str) -> str: 29 | """ 30 | Fetches the content of a tweet and its potential thread using twitterapi.io. 31 | 32 | Args: 33 | url: The URL of the tweet. 34 | 35 | Returns: 36 | A string containing the formatted tweet thread, or an error message starting with "Error:". 37 | """ 38 | API_BASE_URL = "https://api.twitterapi.io" 39 | # Try reading with underscore first (common in container envs), fallback to hyphen 40 | API_KEY = os.getenv("TWITTER_API_IO_KEY") 41 | 42 | if not API_KEY: 43 | # Update error message to reflect both attempts 44 | return "Error: TWITTER_API_IO_KEY not found in environment variables." 45 | 46 | # 1. Extract Tweet ID 47 | match = re.search(r"/status(?:es)?/(\d+)", url) 48 | if not match: 49 | return f"Error: Could not extract Tweet ID from URL: {url}" 50 | tweet_id = match.group(1) 51 | console.print(f"Extracted Tweet ID: {tweet_id}", style="cyan") 52 | 53 | headers = {"X-API-Key": API_KEY} 54 | all_tweets = [] 55 | conversation_id = None 56 | main_tweet_data = None 57 | 58 | # 2. 
def fetch_tweet_thread(url: str) -> str:
    """
    Fetch the content of a tweet and its potential thread using twitterapi.io.

    Args:
        url: The URL of the tweet (x.com / twitter.com status link).

    Returns:
        A string containing the formatted tweet thread, or an error message
        starting with "Error:".
    """
    API_BASE_URL = "https://api.twitterapi.io"
    API_KEY = os.getenv("TWITTER_API_IO_KEY")

    if not API_KEY:
        return "Error: TWITTER_API_IO_KEY not found in environment variables."

    # 1. Extract the numeric tweet ID from the /status/<id> URL segment.
    match = re.search(r"/status(?:es)?/(\d+)", url)
    if not match:
        return f"Error: Could not extract Tweet ID from URL: {url}"
    tweet_id = match.group(1)
    console.print(f"Extracted Tweet ID: {tweet_id}", style="cyan")

    headers = {"X-API-Key": API_KEY}
    all_tweets = []
    conversation_id = None
    main_tweet_data = None

    # 2. Fetch the main tweet.
    try:
        console.print(f"Fetching main tweet ID: {tweet_id}", style="cyan")
        main_tweet_url = f"{API_BASE_URL}/twitter/tweets"
        params = {"tweet_ids": [tweet_id]}
        # A timeout prevents the bot from hanging forever on a stalled API.
        response = requests.get(
            main_tweet_url, headers=headers, params=params, timeout=30
        )
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        data = response.json()

        if data.get("status") != "success" or not data.get("tweets"):
            error_msg = data.get("msg", "Unknown error")
            return f"Error: Failed to fetch main tweet {tweet_id}. API Status: {data.get('status')}, Msg: {error_msg}"

        main_tweet_data = data["tweets"][0]
        all_tweets.append(main_tweet_data)
        conversation_id = main_tweet_data.get("conversationId")
        console.print(
            f"Main tweet fetched. Conversation ID: {conversation_id}", style="green"
        )

    except requests.exceptions.RequestException as e:
        return f"Error: Network or API error fetching main tweet {tweet_id}: {e}"
    except Exception as e:
        return f"Error: Unexpected error processing main tweet response: {e}"

    # 3. Fetch the conversation/thread if conversationId is valid and different
    #    from tweet_id (a single tweet's conversationId is often its own id).
    if conversation_id and conversation_id != tweet_id:
        try:
            console.print(
                f"Fetching conversation thread ID: {conversation_id}", style="cyan"
            )
            thread_url = f"{API_BASE_URL}/twitter/tweet/advanced_search"
            params = {
                "query": f"conversation_id:{conversation_id}",
                # NOTE(review): check the API docs for a sort/filter option that
                # returns only replies posted *after* the main tweet.
            }
            response = requests.get(
                thread_url, headers=headers, params=params, timeout=30
            )
            response.raise_for_status()

            data = response.json()

            if data.get("status") == "success" and data.get("tweets"):
                thread_tweets = data["tweets"]
                # The conversation search may echo the main tweet back; drop it
                # so it is not shown twice.
                filtered_thread_tweets = [
                    t for t in thread_tweets if t.get("id") != tweet_id
                ]
                all_tweets.extend(filtered_thread_tweets)
                console.print(
                    f"Fetched {len(filtered_thread_tweets)} additional tweets in conversation.",
                    style="green",
                )
            elif data.get("status") != "success":
                console.print(
                    f"Warning: Failed to fetch conversation thread {conversation_id}. API Status: {data.get('status')}, Msg: {data.get('msg', 'Unknown error')}",
                    style="yellow",
                )
                # Proceed with only the main tweet.

        except requests.exceptions.RequestException as e:
            console.print(
                f"Warning: Network or API error fetching conversation thread {conversation_id}: {e}",
                style="yellow",
            )
            # Proceed with only the main tweet.
        except Exception as e:
            console.print(
                f"Warning: Unexpected error processing conversation thread response: {e}",
                style="yellow",
            )
            # Proceed with only the main tweet.

    # 4. Sort tweets chronologically; a missing createdAt sorts to the epoch.
    all_tweets.sort(
        key=lambda t: _parse_twitter_datetime(
            t.get("createdAt", "Thu Jan 01 00:00:00 +0000 1970")
        )
    )

    # 5. Format the output: one numbered entry per tweet, separated by "---".
    output_lines = []
    for i, tweet in enumerate(all_tweets):
        author_info = tweet.get("author", {})
        username = author_info.get("userName", "unknown_user")
        created_at_str = tweet.get("createdAt", "Unknown time")
        text = tweet.get("text", "").strip()

        line = f"Tweet {i + 1}/{len(all_tweets)} by @{username} ({created_at_str}):\n{text}\n---"
        output_lines.append(line)

    if not output_lines:
        # Should not happen if the main tweet fetch succeeded.
        return f"Error: No tweet data could be formatted for tweet ID {tweet_id}."

    return "\n".join(output_lines).strip()


# Example usage (for testing this script directly)
if __name__ == "__main__":
    # Test with a known tweet URL (replace with a real one, potentially a thread)
    # test_url_single = "https://x.com/levelsio/status/1798629243934064791"  # Example single tweet
    test_url_thread_start = "https://x.com/omarsar0/status/1917939469103305013?s=52"  # Example thread start (replace if needed)

    print(f"--- Testing with URL: {test_url_thread_start} ---")
    result = fetch_tweet_thread(test_url_thread_start)
    print("\n--- RESULT ---")
    print(result)
    print("--- END TEST ---")
import argparse
import os
import textwrap
from dotenv import load_dotenv
import agentql
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

load_dotenv()


def scrape_youtube(url: str, headless: bool = True) -> dict[str, str]:
    """Return the video's title and full description.

    Args:
        url: Full YouTube watch URL.
        headless: Launch Chromium without a visible window when True.

    Returns:
        A dict with keys ``"title"`` and ``"description"``.
    """

    # 0. Configure AgentQL from the environment.
    agentql.configure(api_key=os.getenv("AGENTQL_API_KEY", ""))

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # Ensure the browser process is closed even if navigation or
        # extraction raises — otherwise each failure leaks a Chromium instance.
        try:
            page = agentql.wrap(browser.new_page())

            # Navigate to the video URL and wait until the page is fully idle.
            page.goto(url, wait_until="domcontentloaded")
            page.wait_for_page_ready_state()

            # 1. Accept cookies (EU consent banner), if present.
            try:
                consent = page.query_elements(
                    """
                    {
                        accept_cookies_btn
                    }
                    """
                )
                consent.accept_cookies_btn.click(timeout=3000)
            except Exception:
                # Fallback for sites that use a different dialog text.
                try:
                    page.locator("button:has-text('Accept all')").click(timeout=3000)
                except PlaywrightTimeoutError:
                    pass  # No consent dialog present

            # 2. Expand the description (click "Show more").
            try:
                controls = page.query_elements(
                    """
                    {
                        expand_description_btn
                    }
                    """
                )
                controls.expand_description_btn.click(timeout=3000)
            except Exception:
                # Fallback selector if AgentQL can't find the button.
                try:
                    page.locator("tp-yt-paper-button:has-text('more')").click(
                        timeout=3000
                    )
                except PlaywrightTimeoutError:
                    pass

            # 3. Extract the title and the full description using AgentQL.
            data = page.query_data(
                """
                {
                    video_title
                    description_text
                }
                """
            )
        finally:
            browser.close()

    return {
        "title": data["video_title"],
        "description": textwrap.dedent(data["description_text"]).strip(),
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape YouTube title and description via Playwright + AgentQL."
    )
    parser.add_argument(
        "--url",
        required=True,
        help="Full YouTube video URL",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        help="Run browser in headless mode (default: GUI).",
    )
    args = parser.parse_args()

    result = scrape_youtube(args.url, headless=args.headless)
    print("\n=== RESULT ===")
    print("Title:", result["title"])
    print("\nDescription:\n", result["description"])