├── .env.example ├── .gitignore ├── AGENTS.md ├── CLAUDE.md ├── README.md ├── eval ├── __init.py__ ├── email_dataset.py ├── evaluate_triage.py └── prompts.py ├── langgraph.json ├── notebooks ├── agent.ipynb ├── evaluation.ipynb ├── hitl.ipynb ├── img │ ├── HITL_flow.png │ ├── HITL_flow_memory.png │ ├── HITL_flow_triage.png │ ├── agent-inbox-draft.png │ ├── agent-inbox-edit.png │ ├── agent-inbox.png │ ├── agent.png │ ├── agent_example.png │ ├── agent_loop.png │ ├── agent_workflow.png │ ├── checkpoints.png │ ├── ecosystem.png │ ├── email_workflow.png │ ├── eval.png │ ├── eval_detail.png │ ├── eval_types.png │ ├── hitl_schematic.png │ ├── img_helper.py │ ├── langgraph_studio.png │ ├── memory-studio.png │ ├── nodes_edges.png │ ├── overview.png │ ├── overview_agent.png │ ├── overview_eval.png │ ├── overview_hitl.png │ ├── overview_memory.png │ ├── router.png │ ├── short-vs-long.png │ ├── studio-interrupt.png │ ├── studio.png │ ├── test_result.png │ ├── tool_call.png │ ├── tool_call_detail.png │ ├── workflow_example.png │ └── workflow_v_agent.png ├── langgraph_101.ipynb ├── memory.ipynb └── test_tools.py ├── pyproject.toml ├── src └── email_assistant │ ├── __init__.py │ ├── configuration.py │ ├── cron.py │ ├── email_assistant.py │ ├── email_assistant_hitl.py │ ├── email_assistant_hitl_memory.py │ ├── email_assistant_hitl_memory_gmail.py │ ├── langgraph_101.py │ ├── prompts.py │ ├── schemas.py │ ├── tools │ ├── __init__.py │ ├── base.py │ ├── default │ │ ├── __init__.py │ │ ├── calendar_tools.py │ │ ├── email_tools.py │ │ └── prompt_templates.py │ └── gmail │ │ ├── README.md │ │ ├── __init__.py │ │ ├── gmail_tools.py │ │ ├── prompt_templates.py │ │ ├── run_ingest.py │ │ ├── setup_cron.py │ │ └── setup_gmail.py │ └── utils.py └── tests ├── conftest.py ├── hitl_testing.ipynb ├── memory_testing.ipynb ├── run_all_tests.py ├── test_notebooks.py └── test_response.py /.env.example: -------------------------------------------------------------------------------- 1 | 
OPENAI_API_KEY=your_openai_api_key 2 | LANGSMITH_TRACING=true 3 | LANGSMITH_API_KEY=your_langsmith_api_key 4 | LANGSMITH_PROJECT="interrupt-workshop" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | .secrets/ 23 | 24 | # Jupyter Notebook 25 | .ipynb_checkpoints 26 | .langgraph_checkpoint 27 | *.ipynb 28 | !notebooks/*.ipynb 29 | !notebooks/notebooks/*.ipynb 30 | !tests/*.ipynb 31 | 32 | # Virtual Environment 33 | .venv 34 | venv/ 35 | env/ 36 | ENV/ 37 | .env 38 | 39 | # IDE 40 | .idea/ 41 | .vscode/ 42 | *.swp 43 | *.swo 44 | .DS_Store 45 | 46 | # Testing 47 | .coverage 48 | htmlcov/ 49 | .pytest_cache/ 50 | .tox/ 51 | 52 | # Distribution 53 | *.tar.gz 54 | *.whl 55 | 56 | # Misc 57 | .DS_Store 58 | .env.local 59 | .env.development.local 60 | .env.test.local 61 | .env.production.local 62 | .langgraph_api 63 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # Agents in this Repository 2 | 3 | ## Overview 4 | 5 | This repository demonstrates building agents using LangGraph, focusing on an email assistant that can: 6 | - Triage incoming emails 7 | - Draft appropriate responses 8 | - Execute actions (calendar scheduling, etc.) 
9 | - Incorporate human feedback 10 | - Learn from past interactions 11 | 12 | ## Environment Setup 13 | 14 | ```bash 15 | # Create and activate a virtual environment 16 | python3 -m venv .venv 17 | source .venv/bin/activate 18 | 19 | # Ensure you have a recent version of pip (required for editable installs with pyproject.toml) 20 | python3 -m pip install --upgrade pip 21 | 22 | # Install the package in editable mode 23 | pip install -e . 24 | ``` 25 | 26 | ## Agent Implementations 27 | 28 | ### Scripts 29 | 30 | The repository contains several implementations with increasing complexity in `src/email_assistant`: 31 | 32 | 1. **LangGraph 101** (`langgraph_101.py`) 33 | - Basics of LangGraph 34 | 35 | 2. **Basic Email Assistant** (`email_assistant.py`) 36 | - Core email triage and response functionality 37 | 38 | 3. **Human-in-the-Loop** (`email_assistant_hitl.py`) 39 | - Adds ability for humans to review and approve actions 40 | 41 | 4. **Memory-Enabled HITL** (`email_assistant_hitl_memory.py`) 42 | - Adds persistent memory to learn from feedback 43 | 44 | 5. **Gmail Integration** (`email_assistant_hitl_memory_gmail.py`) 45 | - Connects to Gmail API for real email processing 46 | 47 | ### Notebooks 48 | 49 | Each aspect of the agent is explained in dedicated notebooks: 50 | - `notebooks/langgraph_101.ipynb` - LangGraph basics 51 | - `notebooks/agent.ipynb` - Basic agent implementation 52 | - `notebooks/evaluation.ipynb` - Agent evaluation 53 | - `notebooks/hitl.ipynb` - Human-in-the-loop functionality 54 | - `notebooks/memory.ipynb` - Adding memory capabilities 55 | 56 | ## Running Tests 57 | 58 | ### Testing Scripts 59 | 60 | Test to ensure all implementations work: 61 | 62 | ```bash 63 | # Test all implementations 64 | python tests/run_all_tests.py --all 65 | ``` 66 | 67 | (Note: This will leave out the Gmail implementation `email_assistant_hitl_memory_gmail` from testing.) 
68 | 69 | ### Testing Notebooks 70 | 71 | Test all notebooks to ensure they run without errors: 72 | 73 | ```bash 74 | # Run all notebook tests directly 75 | python tests/test_notebooks.py 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # Agents in this Repository 2 | 3 | ## Overview 4 | 5 | This repository demonstrates building agents using LangGraph, focusing on an email assistant that can: 6 | - Triage incoming emails 7 | - Draft appropriate responses 8 | - Execute actions (calendar scheduling, etc.) 9 | - Incorporate human feedback 10 | - Learn from past interactions 11 | 12 | ## Environment Setup 13 | 14 | ```bash 15 | # Create and activate a virtual environment 16 | python3 -m venv .venv 17 | source .venv/bin/activate 18 | 19 | # Ensure you have a recent version of pip (required for editable installs with pyproject.toml) 20 | python3 -m pip install --upgrade pip 21 | 22 | # Install the package in editable mode 23 | pip install -e . 24 | ``` 25 | 26 | ## Agent Implementations 27 | 28 | ### Scripts 29 | 30 | The repository contains several implementations with increasing complexity in `src/email_assistant`: 31 | 32 | 1. **LangGraph 101** (`langgraph_101.py`) 33 | - Basics of LangGraph 34 | 35 | 2. **Basic Email Assistant** (`email_assistant.py`) 36 | - Core email triage and response functionality 37 | 38 | 3. **Human-in-the-Loop** (`email_assistant_hitl.py`) 39 | - Adds ability for humans to review and approve actions 40 | 41 | 4. **Memory-Enabled HITL** (`email_assistant_hitl_memory.py`) 42 | - Adds persistent memory to learn from feedback 43 | 44 | 5. 
**Gmail Integration** (`email_assistant_hitl_memory_gmail.py`) 45 | - Connects to Gmail API for real email processing 46 | 47 | ### Notebooks 48 | 49 | Each aspect of the agent is explained in dedicated notebooks: 50 | - `notebooks/langgraph_101.ipynb` - LangGraph basics 51 | - `notebooks/agent.ipynb` - Basic agent implementation 52 | - `notebooks/evaluation.ipynb` - Agent evaluation 53 | - `notebooks/hitl.ipynb` - Human-in-the-loop functionality 54 | - `notebooks/memory.ipynb` - Adding memory capabilities 55 | 56 | ## Running Tests 57 | 58 | ### Testing Scripts 59 | 60 | Test to ensure all implementations work: 61 | 62 | ```bash 63 | # Test all implementations 64 | python tests/run_all_tests.py --all 65 | ``` 66 | 67 | (Note: This will leave out the Gmail implementation `email_assistant_hitl_memory_gmail` from testing.) 68 | 69 | ### Testing Notebooks 70 | 71 | Test all notebooks to ensure they run without errors: 72 | 73 | ```bash 74 | # Run all notebook tests directly 75 | python tests/test_notebooks.py 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Agents From Scratch 2 | 3 | The repo is a guide to building agents from scratch. It builds up to an ["ambient"](https://blog.langchain.dev/introducing-ambient-agents/) agent that can manage your email with connection to the Gmail API. It's grouped into 4 sections, each with a notebook and accompanying code in the `src/email_assistant` directory. These sections build from the basics of agents, to agent evaluation, to human-in-the-loop, and finally to memory. These all come together in an agent that you can deploy, and the principles can be applied to other agents across a wide range of tasks. 4 | 5 | ![overview](notebooks/img/overview.png) 6 | 7 | ## Environment Setup 8 | 9 | ### Python Version 10 | 11 | * Ensure you're using Python 3.11 or later. 
12 | * This version is required for optimal compatibility with LangGraph. 13 | 14 | ```shell 15 | python3 --version 16 | ``` 17 | 18 | ### API Keys 19 | 20 | * If you don't have an OpenAI API key, you can sign up [here](https://openai.com/index/openai-api/). 21 | * Sign up for LangSmith [here](https://smith.langchain.com/). 22 | * Generate a LangSmith API key. 23 | 24 | ### Set Environment Variables 25 | 26 | * Create a `.env` file in the root directory: 27 | ```shell 28 | # Copy the .env.example file to .env 29 | cp .env.example .env 30 | ``` 31 | 32 | * Edit the `.env` file with the following: 33 | ```shell 34 | LANGSMITH_API_KEY=your_langsmith_api_key 35 | LANGSMITH_TRACING=true 36 | LANGSMITH_PROJECT="interrupt-workshop" 37 | OPENAI_API_KEY=your_openai_api_key 38 | ``` 39 | 40 | * You can also set the environment variables in your terminal: 41 | ```shell 42 | export LANGSMITH_API_KEY=your_langsmith_api_key 43 | export LANGSMITH_TRACING=true 44 | export OPENAI_API_KEY=your_openai_api_key 45 | ``` 46 | 47 | ### Create a virtual environment and activate it 48 | 49 | ```shell 50 | $ python3 -m venv .venv 51 | $ source .venv/bin/activate 52 | # Ensure you have a recent version of pip (required for editable installs with pyproject.toml) 53 | $ python3 -m pip install --upgrade pip 54 | # Install the package in editable mode 55 | $ pip install -e . 56 | ``` 57 | 58 | > **⚠️ IMPORTANT**: Do not skip the `pip install -e .` step! This editable install is **required** for the notebooks to work correctly. Without it, you'll get `ModuleNotFoundError: No module named 'email_assistant'` errors when running the notebooks. 59 | 60 | ## Structure 61 | 62 | The repo is organized into the 4 sections, with a notebook for each and accompanying code in the `src/email_assistant` directory. 63 | 64 | ### Preface: LangGraph 101 65 | For a brief introduction to LangGraph and some of the concepts used in this repo, see the [LangGraph 101 notebook](notebooks/langgraph_101.ipynb). 
This notebook explains the basics of chat models, tool calling, agents vs workflows, LangGraph nodes / edges / memory, and LangGraph Studio. 66 | 67 | ### Building an agent 68 | * Notebook: [notebooks/agent.ipynb](/notebooks/agent.ipynb) 69 | * Code: [src/email_assistant/email_assistant.py](/src/email_assistant/email_assistant.py) 70 | 71 | ![overview-agent](notebooks/img/overview_agent.png) 72 | 73 | This notebook shows how to build the email assistant, combining an [email triage step](https://langchain-ai.github.io/langgraph/tutorials/workflows/) with an agent that handles the email response. You can see the linked code for the full implementation in `src/email_assistant/email_assistant.py`. 74 | 75 | ![Screenshot 2025-04-04 at 4 06 18 PM](notebooks/img/studio.png) 76 | 77 | ### Evaluation 78 | * Notebook: [notebooks/evaluation.ipynb](/notebooks/evaluation.ipynb) 79 | 80 | ![overview-eval](notebooks/img/overview_eval.png) 81 | 82 | This notebook introduces evaluation with an email dataset in [eval/email_dataset.py](/eval/email_dataset.py). It shows how to run evaluations using Pytest and the LangSmith `evaluate` API. It runs evaluation for email responses using LLM-as-a-judge as well as evaluations for tool calls and triage decisions. 83 | 84 | ![Screenshot 2025-04-08 at 8 07 48 PM](notebooks/img/eval.png) 85 | 86 | ### Human-in-the-loop 87 | * Notebook: [notebooks/hitl.ipynb](/notebooks/hitl.ipynb) 88 | * Code: [src/email_assistant/email_assistant_hitl.py](/src/email_assistant/email_assistant_hitl.py) 89 | 90 | ![overview-hitl](notebooks/img/overview_hitl.png) 91 | 92 | This notebook shows how to add human-in-the-loop (HITL), allowing the user to review specific tool calls (e.g., send email, schedule meeting). For this, we use [Agent Inbox](https://github.com/langchain-ai/agent-inbox) as an interface for human in the loop. 
You can see the linked code for the full implementation in [src/email_assistant/email_assistant_hitl.py](/src/email_assistant/email_assistant_hitl.py). 93 | 94 | ![Agent Inbox showing email threads](notebooks/img/agent-inbox.png) 95 | 96 | ### Memory 97 | * Notebook: [notebooks/memory.ipynb](/notebooks/memory.ipynb) 98 | * Code: [src/email_assistant/email_assistant_hitl_memory.py](/src/email_assistant/email_assistant_hitl_memory.py) 99 | 100 | ![overview-memory](notebooks/img/overview_memory.png) 101 | 102 | This notebook shows how to add memory to the email assistant, allowing it to learn from user feedback and adapt to preferences over time. The memory-enabled assistant ([email_assistant_hitl_memory.py](/src/email_assistant/email_assistant_hitl_memory.py)) uses the [LangGraph Store](https://langchain-ai.github.io/langgraph/concepts/memory/#long-term-memory) to persist memories. You can see the linked code for the full implementation in [src/email_assistant/email_assistant_hitl_memory.py](/src/email_assistant/email_assistant_hitl_memory.py). 103 | 104 | ## Connecting to APIs 105 | 106 | The above notebooks use mock email and calendar tools. 107 | 108 | ### Gmail Integration 109 | 110 | Set up Google API credentials following the instructions in [Gmail Tools README](src/email_assistant/tools/gmail/README.md). 111 | 112 | The README also explains how to deploy the graph to LangGraph Platform. 113 | 114 | The full implementation of the Gmail integration is in [src/email_assistant/email_assistant_hitl_memory_gmail.py](/src/email_assistant/email_assistant_hitl_memory_gmail.py). 115 | 116 | ## Running Tests 117 | 118 | The repository includes an automated test suite to evaluate the email assistant implementations. Tests verify correct tool usage and response quality using LangSmith for tracking. 
119 | 120 | ### Running Tests with [run_all_tests.py](/tests/run_all_tests.py) 121 | 122 | The test runner supports testing different implementations of the email assistant: 123 | 124 | ```shell 125 | # Run tests for the default implementation (email_assistant) 126 | python tests/run_all_tests.py 127 | 128 | # Run tests for a specific implementation 129 | python tests/run_all_tests.py --implementation email_assistant_hitl 130 | 131 | # Run tests for all available implementations 132 | python tests/run_all_tests.py --all 133 | 134 | # Add a specific experiment name for LangSmith tracking 135 | python tests/run_all_tests.py --experiment-name "Custom Test Run" 136 | ``` 137 | 138 | ### Test Results 139 | 140 | Test results are logged to LangSmith under the project name specified in your `.env` file (`LANGSMITH_PROJECT`). This provides: 141 | - Visual inspection of agent traces 142 | - Detailed evaluation metrics 143 | - Comparison of different agent implementations 144 | 145 | ### Available Test Implementations 146 | 147 | The available implementations for testing are: 148 | - `email_assistant` - Basic email assistant 149 | - `email_assistant_hitl` - Human-in-the-loop version 150 | - `email_assistant_hitl_memory` - Memory-enabled HITL version 151 | - `email_assistant_hitl_memory_gmail` - Gmail-integrated version 152 | 153 | ### Testing Notebooks 154 | 155 | You can also run tests to verify all notebooks execute without errors: 156 | 157 | ```shell 158 | # Run all notebook tests 159 | python tests/test_notebooks.py 160 | 161 | # Or run via pytest 162 | pytest tests/test_notebooks.py -v 163 | ``` 164 | 165 | ## Future Extensions 166 | 167 | Add [LangMem](https://langchain-ai.github.io/langmem/) to manage memories: 168 | * Manage a collection of background memories. 169 | * Add memory tools that can look up facts in the background memories. 
170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /eval/__init.py__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/eval/__init.py__ -------------------------------------------------------------------------------- /eval/email_dataset.py: -------------------------------------------------------------------------------- 1 | """Email evaluation dataset with ground truth classifications.""" 2 | 3 | # Common reply email 4 | STANDARD_EMAIL = { 5 | "author": "Alice Smith ", 6 | "to": "John Doe ", 7 | "subject": "Quick question about API documentation", 8 | "email_thread": """Hi John, 9 | 10 | I was reviewing the API documentation for the new authentication service and noticed a few endpoints seem to be missing from the specs. Could you help clarify if this was intentional or if we should update the docs? 11 | 12 | Specifically, I'm looking at: 13 | - /auth/refresh 14 | - /auth/validate 15 | 16 | Thanks! 17 | Alice""", 18 | } 19 | 20 | # Common notification email 21 | NOTIFICATION_EMAIL = { 22 | "author": "System Admin ", 23 | "to": "Development Team ", 24 | "subject": "Scheduled maintenance - database downtime", 25 | "email_thread": """Hi team, 26 | 27 | This is a reminder that we'll be performing scheduled maintenance on the production database tonight from 2AM to 4AM EST. During this time, all database services will be unavailable. 28 | 29 | Please plan your work accordingly and ensure no critical deployments are scheduled during this window. 
30 | 31 | Thanks, 32 | System Admin Team""" 33 | } 34 | 35 | # Dataset examples 36 | email_input_1 = { 37 | "author": "Alice Smith ", 38 | "to": "Lance Martin ", 39 | "subject": "Quick question about API documentation", 40 | "email_thread": """Hi Lance, 41 | 42 | I was reviewing the API documentation for the new authentication service and noticed a few endpoints seem to be missing from the specs. Could you help clarify if this was intentional or if we should update the docs? 43 | 44 | Specifically, I'm looking at: 45 | - /auth/refresh 46 | - /auth/validate 47 | 48 | Thanks! 49 | Alice""", 50 | } 51 | 52 | email_input_2 = { 53 | "author": "Marketing Team ", 54 | "to": "Lance Martin ", 55 | "subject": "New Company Newsletter Available", 56 | "email_thread": """Hello Lance, 57 | 58 | The latest edition of our company newsletter is now available on the intranet. This month features articles on our Q2 results, upcoming team building activities, and employee spotlights. 59 | 60 | Check it out when you have a chance! 61 | 62 | Best regards, 63 | Marketing Team""", 64 | } 65 | 66 | email_input_3 = { 67 | "author": "System Admin ", 68 | "to": "Lance Martin ", 69 | "subject": "Scheduled maintenance - database downtime", 70 | "email_thread": """Hi Lance, 71 | 72 | This is a reminder that we'll be performing scheduled maintenance on the production database tonight from 2AM to 4AM EST. During this time, all database services will be unavailable. 73 | 74 | Please plan your work accordingly and ensure no critical deployments are scheduled during this window. 75 | 76 | Thanks, 77 | System Admin Team""", 78 | } 79 | 80 | email_input_4 = { 81 | "author": "Project Manager ", 82 | "to": "Lance Martin ", 83 | "subject": "Tax season let's schedule call", 84 | "email_thread": """Lance, 85 | 86 | It's tax season again, and I wanted to schedule a call to discuss your tax planning strategies for this year. I have some suggestions that could potentially save you money. 
87 | 88 | Are you available sometime next week? Tuesday or Thursday afternoon would work best for me, for about 45 minutes. 89 | 90 | Regards, 91 | Project Manager""", 92 | } 93 | 94 | email_input_5 = { 95 | "author": "HR Department ", 96 | "to": "Lance Martin ", 97 | "subject": "Reminder: Submit your expense reports", 98 | "email_thread": """Hello Lance, 99 | 100 | This is a friendly reminder that all expense reports for the previous month need to be submitted by this Friday. Please make sure to include all receipts and proper documentation. 101 | 102 | If you have any questions about the submission process, feel free to reach out to the HR team. 103 | 104 | Best regards, 105 | HR Department""", 106 | } 107 | 108 | email_input_6 = { 109 | "author": "Conference Organizer ", 110 | "to": "Lance Martin ", 111 | "subject": "Do you want to attend this conference?", 112 | "email_thread": """Hi Lance, 113 | 114 | We're reaching out to invite you to TechConf 2025, happening May 15-17 in San Francisco. 115 | 116 | The conference features keynote speakers from major tech companies, workshops on AI and ML, and great networking opportunities. Early bird registration is available until April 30th. 117 | 118 | Would you be interested in attending? We can also arrange for group discounts if other team members want to join. 119 | 120 | Best regards, 121 | Conference Organizers""", 122 | } 123 | 124 | email_input_7 = { 125 | "author": "Sarah Johnson ", 126 | "to": "Lance Martin ", 127 | "subject": "Can you review these docs before submission?", 128 | "email_thread": """Lance, 129 | 130 | I've attached the final version of our proposal for the Henderson project. Could you please review the technical specifications section (pages 15-20) before we submit it to the client on Friday? 131 | 132 | Your expertise would really help ensure we've covered all the necessary details. 
133 | 134 | Thanks in advance, 135 | Sarah""", 136 | } 137 | 138 | email_input_8 = { 139 | "author": "Community Pool ", 140 | "to": "Lance Martin ", 141 | "subject": "Sign up daughter for swimming class", 142 | "email_thread": """Dear Lance, 143 | 144 | Summer swimming registration is now open! Based on your daughter's participation last year, we wanted to let you know that intermediate level classes are available on Mondays and Wednesdays at 4PM or Tuesdays and Thursdays at 5PM. 145 | 146 | Classes begin June 1st and run for 8 weeks. Space is limited, so early registration is recommended. 147 | 148 | Please let us know if you'd like to reserve a spot. 149 | 150 | Regards, 151 | City Recreation Department""", 152 | } 153 | 154 | email_input_9 = { 155 | "author": "GitHub ", 156 | "to": "Lance Martin ", 157 | "subject": "PR #42: Comment from alex-dev", 158 | "email_thread": """Hey there! 159 | 160 | alex-dev commented on your pull request #42 in langchain-ai/project: 161 | 162 | > I've reviewed the changes and everything looks good. Just one small suggestion for the error handling in auth_controller.py. Maybe we should add a timeout parameter to prevent hanging requests? 163 | 164 | View the comment: https://github.com/langchain-ai/project/pull/42#comment-12345 165 | 166 | --- 167 | You're receiving this because you authored the thread. 168 | Reply to this email directly, or view it on GitHub 169 | """, 170 | } 171 | 172 | email_input_10 = { 173 | "author": "Team Lead ", 174 | "to": "Lance Martin ", 175 | "subject": "Quarterly planning meeting", 176 | "email_thread": """Hi Lance, 177 | 178 | It's time for our quarterly planning session. I'd like to schedule a 90-minute meeting next week to discuss our roadmap for Q3. 179 | 180 | Could you let me know your availability for Monday or Wednesday? Ideally sometime between 10AM and 3PM. 181 | 182 | Looking forward to your input on the new feature priorities. 
183 | 184 | Best, 185 | Team Lead""", 186 | } 187 | 188 | email_input_11 = { 189 | "author": "AWS Monitoring ", 190 | "to": "Lance Martin ", 191 | "subject": "System admin alert: Instance CPU utilization exceeds threshold", 192 | "email_thread": """ALERT: High CPU Utilization 193 | 194 | The following EC2 instance has exceeded the CPU utilization threshold of 90% for more than 15 minutes: 195 | 196 | Instance ID: i-0b2d3e4f5a6b7c8d9 197 | Region: us-west-2 198 | Current utilization: 95.3% 199 | 200 | This message is automatically generated. Please do not reply. 201 | """, 202 | } 203 | 204 | email_input_12 = { 205 | "author": "Client Success ", 206 | "to": "Lance Martin ", 207 | "subject": "Your subscription will renew automatically", 208 | "email_thread": """Hello Lance, 209 | 210 | This is a friendly reminder that your annual subscription to our Developer Pro plan will automatically renew on 04/15/2025. 211 | 212 | Your payment method ending in **** 4567 will be charged $1,499.00. 213 | 214 | If you would like to make any changes to your subscription, please visit your account settings or contact our support team before the renewal date. 215 | 216 | Thank you for your continued business! 217 | 218 | Client Success Team""", 219 | } 220 | 221 | email_input_13 = { 222 | "author": "Dr. Roberts ", 223 | "to": "Lance Martin ", 224 | "subject": "Annual checkup reminder", 225 | "email_thread": """Hello Lance, 226 | 227 | This is a reminder that it's time for your annual checkup. Our records show that your last visit was approximately one year ago. 228 | 229 | Please call our office at (555) 123-4567 to schedule an appointment at your earliest convenience. 230 | 231 | Best regards, 232 | Dr. 
Roberts' Office""", 233 | } 234 | 235 | email_input_14 = { 236 | "author": "Social Media Platform ", 237 | "to": "Lance Martin ", 238 | "subject": "5 people liked your post", 239 | "email_thread": """Hi Lance, 240 | 241 | 5 people liked your recent post about "Machine Learning Techniques for NLP" 242 | 243 | See who liked your post and continue the conversation! 244 | 245 | [View activity] 246 | 247 | To unsubscribe from these notifications, adjust your settings here. 248 | """, 249 | } 250 | 251 | email_input_15 = { 252 | "author": "Project Team ", 253 | "to": "Lance Martin ", 254 | "subject": "Joint presentation next month", 255 | "email_thread": """Hi Lance, 256 | 257 | The leadership team has asked us to prepare a joint presentation on our recent project successes for the all-hands meeting next month. 258 | 259 | I've started putting together some slides and would appreciate your input on the technical architecture section. Could we schedule about 60 minutes sometime in the next week to collaborate on this? 260 | 261 | I'm generally free on Tuesdays and Thursdays. 262 | 263 | Thanks, 264 | Project Team""", 265 | } 266 | 267 | email_input_16 = { 268 | "author": "Marketing Team ", 269 | "to": "Lance Martin ", 270 | "subject": "Newsletter: New Model from OpenAI", 271 | "email_thread": """Hi Lance, 272 | 273 | We're excited to announce that we've released a new model from OpenAI! 274 | 275 | It's called "GPT-5" and it's a successor to GPT-4. 276 | 277 | It's available now and you can find more information [here](https://openai.com/gpt-5). 
278 | 279 | Thanks, 280 | Marketing Team""", 281 | } 282 | 283 | # Triage outputs: "ignore", "notify", "respond" 284 | triage_output_1 = "respond" 285 | triage_output_2 = "ignore" 286 | triage_output_3 = "notify" 287 | triage_output_4 = "respond" 288 | triage_output_5 = "notify" 289 | triage_output_6 = "respond" 290 | triage_output_7 = "respond" 291 | triage_output_8 = "respond" 292 | triage_output_9 = "notify" 293 | triage_output_10 = "respond" 294 | triage_output_11 = "notify" 295 | triage_output_12 = "notify" 296 | triage_output_13 = "respond" 297 | triage_output_14 = "ignore" 298 | triage_output_15 = "respond" 299 | triage_output_16 = "notify" 300 | 301 | # Response criteria (when applicable) 302 | response_criteria_1 = """ 303 | • Send email with write_email tool call to acknowledge the question and confirm it will be investigated 304 | """ 305 | 306 | response_criteria_2 = """ 307 | • No response needed 308 | • Ensure this is ignored 309 | """ 310 | 311 | response_criteria_3 = """ 312 | • No response needed 313 | • Ensure the user is notified 314 | """ 315 | 316 | response_criteria_4 = """ 317 | • Check calendar availability for Tuesday or Thursday afternoon next week with check_calendar_availability tool call 318 | • Confirm availability for a 45-minute meeting 319 | • Send calendar invite with schedule_meeting tool call 320 | • Send email with write_email tool call to acknowledge tax planning request and notifying that a meeting has been scheduled 321 | """ 322 | 323 | response_criteria_5 = """ 324 | • No response needed 325 | • Ensure the user is notified 326 | """ 327 | 328 | response_criteria_6 = """ 329 | • Express interest in attending TechConf 2025 330 | • Ask specific questions about AI/ML workshops 331 | • Inquire about group discount details 332 | • Send email with write_email tool call to express interest in attending TechConf 2025, ask specific questions about AI/ML workshops, and inquire about group discount details 333 | """ 334 | 335 | 
response_criteria_7 = """ 336 | • Explicitly agree to review the technical specifications 337 | • Acknowledge Friday deadline 338 | • Send email with write_email tool call to explicitly agree to review the technical specifications and acknowledge Friday deadline 339 | """ 340 | 341 | response_criteria_8 = """ 342 | • Send email with write_email tool call to express interest in registering daughter for swimming class 343 | """ 344 | 345 | response_criteria_9 = """ 346 | • No response needed 347 | • Ensure the user is notified 348 | """ 349 | 350 | response_criteria_10 = """ 351 | • Check calendar for 90-minute meeting availability for Monday or Wednesday with check_calendar_availability tool call 352 | • Send email acknowledging the request and providing availability with write_email tool call 353 | """ 354 | 355 | response_criteria_11 = """ 356 | • No response needed 357 | • Ensure the user is notified 358 | """ 359 | 360 | response_criteria_12 = """ 361 | • No response needed 362 | • Ensure the user is notified 363 | """ 364 | 365 | response_criteria_13 = """ 366 | • Acknowledge annual checkup reminder 367 | • Send email with write_email tool call to acknowledge annual checkup reminder 368 | """ 369 | 370 | response_criteria_14 = """ 371 | • No response needed 372 | • Ensure this is ignored 373 | """ 374 | 375 | response_criteria_15 = """ 376 | • Check calendar for 60-minute meeting availability for Tuesday or Thursday with check_calendar_availability tool call 377 | • Send calendar invite with schedule_meeting tool call 378 | • Send email agreeing to collaborate on the joint presentation and notifying that a meeting has been scheduled with write_email tool call 379 | """ 380 | 381 | response_criteria_16 = """ 382 | • No response needed 383 | • Ensure the user is notified 384 | """ 385 | 386 | examples_triage = [ 387 | { 388 | "inputs": {"email_input": email_input_1}, 389 | "outputs": {"classification": triage_output_1}, 390 | }, 391 | { 392 | "inputs": 
{"email_input": email_input_2}, 393 | "outputs": {"classification": triage_output_2}, 394 | }, 395 | { 396 | "inputs": {"email_input": email_input_3}, 397 | "outputs": {"classification": triage_output_3}, 398 | }, 399 | { 400 | "inputs": {"email_input": email_input_4}, 401 | "outputs": {"classification": triage_output_4}, 402 | }, 403 | { 404 | "inputs": {"email_input": email_input_5}, 405 | "outputs": {"classification": triage_output_5}, 406 | }, 407 | { 408 | "inputs": {"email_input": email_input_6}, 409 | "outputs": {"classification": triage_output_6}, 410 | }, 411 | { 412 | "inputs": {"email_input": email_input_7}, 413 | "outputs": {"classification": triage_output_7}, 414 | }, 415 | { 416 | "inputs": {"email_input": email_input_8}, 417 | "outputs": {"classification": triage_output_8}, 418 | }, 419 | { 420 | "inputs": {"email_input": email_input_9}, 421 | "outputs": {"classification": triage_output_9}, 422 | }, 423 | { 424 | "inputs": {"email_input": email_input_10}, 425 | "outputs": {"classification": triage_output_10}, 426 | }, 427 | { 428 | "inputs": {"email_input": email_input_11}, 429 | "outputs": {"classification": triage_output_11}, 430 | }, 431 | { 432 | "inputs": {"email_input": email_input_12}, 433 | "outputs": {"classification": triage_output_12}, 434 | }, 435 | { 436 | "inputs": {"email_input": email_input_13}, 437 | "outputs": {"classification": triage_output_13}, 438 | }, 439 | { 440 | "inputs": {"email_input": email_input_14}, 441 | "outputs": {"classification": triage_output_14}, 442 | }, 443 | { 444 | "inputs": {"email_input": email_input_15}, 445 | "outputs": {"classification": triage_output_15}, 446 | }, 447 | { 448 | "inputs": {"email_input": email_input_16}, 449 | "outputs": {"classification": triage_output_16}, 450 | }, 451 | ] 452 | 453 | email_inputs = [ 454 | email_input_1, email_input_2, email_input_3, email_input_4, email_input_5, 455 | email_input_6, email_input_7, email_input_8, email_input_9, email_input_10, 456 | email_input_11, 
email_input_12, email_input_13, email_input_14, email_input_15, 457 | email_input_16 458 | ] 459 | 460 | email_names = [ 461 | "email_input_1", "email_input_2", "email_input_3", "email_input_4", "email_input_5", 462 | "email_input_6", "email_input_7", "email_input_8", "email_input_9", "email_input_10", 463 | "email_input_11", "email_input_12", "email_input_13", "email_input_14", "email_input_15", 464 | "email_input_16" 465 | ] 466 | 467 | response_criteria_list = [ 468 | response_criteria_1, response_criteria_2, response_criteria_3, response_criteria_4, response_criteria_5, 469 | response_criteria_6, response_criteria_7, response_criteria_8, response_criteria_9, response_criteria_10, 470 | response_criteria_11, response_criteria_12, response_criteria_13, response_criteria_14, response_criteria_15, 471 | response_criteria_16 472 | ] 473 | 474 | triage_outputs_list = [ 475 | triage_output_1, triage_output_2, triage_output_3, triage_output_4, triage_output_5, 476 | triage_output_6, triage_output_7, triage_output_8, triage_output_9, triage_output_10, 477 | triage_output_11, triage_output_12, triage_output_13, triage_output_14, triage_output_15, 478 | triage_output_16 479 | ] 480 | 481 | # Define expected tool calls for each email response based on content analysis 482 | # Options: write_email, schedule_meeting, check_calendar_availability, done 483 | expected_tool_calls = [ 484 | ["write_email", "done"], # email_input_1: API documentation question 485 | [], # email_input_2: Newsletter notification - ignore 486 | [], # email_input_3: System maintenance notification - notification only 487 | ["check_calendar_availability", "schedule_meeting", "write_email", "done"], # email_input_4: Tax call scheduling 488 | [], # email_input_5: Expense report reminder - notification only 489 | ["write_email", "done"], # email_input_6: Conference invitation - needs response 490 | ["write_email", "done"], # email_input_7: Document review request 491 | ["write_email", "done"], # 
email_input_8: Swimming class registration 492 | [], # email_input_9: GitHub PR comment - notification only 493 | ["check_calendar_availability", "write_email", "done"], # email_input_10: Planning meeting 494 | [], # email_input_11: AWS alert - notification only 495 | [], # email_input_12: Subscription renewal - ignore 496 | ["write_email", "done"], # email_input_13: Doctor appointment reminder 497 | [], # email_input_14: Social media notification - no action needed 498 | ["check_calendar_availability", "schedule_meeting", "write_email", "done"], # email_input_15: Joint presentation 499 | [], # email_input_16: Newsletter - notification only 500 | ] -------------------------------------------------------------------------------- /eval/evaluate_triage.py: -------------------------------------------------------------------------------- 1 | from langsmith import Client 2 | from langsmith import testing as t 3 | 4 | import os 5 | import matplotlib.pyplot as plt 6 | from datetime import datetime 7 | 8 | from eval.email_dataset import examples_triage 9 | 10 | from src.email_assistant.email_assistant import email_assistant 11 | 12 | # Client 13 | client = Client() 14 | 15 | # Dataset name 16 | dataset_name = "Interrupt Workshop: E-mail Triage Dataset" 17 | 18 | # If the dataset doesn't exist, create it 19 | if not client.has_dataset(dataset_name=dataset_name): 20 | 21 | # Create the dataset 22 | dataset = client.create_dataset( 23 | dataset_name=dataset_name, 24 | description="A dataset of e-mails and their triage decisions." 25 | ) 26 | 27 | # Add examples to the dataset 28 | client.create_examples(dataset_id=dataset.id, examples=examples_triage) 29 | 30 | # Target functions that run our email assistants 31 | def target_email_assistant(inputs: dict) -> dict: 32 | """Process an email through the workflow-based email assistant. 
33 | 34 | Args: 35 | inputs: A dictionary containing the email_input field from the dataset 36 | 37 | Returns: 38 | A formatted dictionary with the assistant's response messages 39 | """ 40 | try: 41 | response = email_assistant.invoke({"email_input": inputs["email_input"]}) 42 | if "classification_decision" in response: 43 | return {"classification_decision": response['classification_decision']} 44 | else: 45 | print("No classification_decision in response from workflow agent") 46 | return {"classification_decision": "unknown"} 47 | except Exception as e: 48 | print(f"Error in workflow agent: {e}") 49 | return {"classification_decision": "unknown"} 50 | 51 | ## Evaluator 52 | feedback_key = "classification" # Key saved to langsmith 53 | 54 | def classification_evaluator(outputs: dict, reference_outputs: dict) -> bool: 55 | """Check if the answer exactly matches the expected answer.""" 56 | return outputs["classification_decision"].lower() == reference_outputs["classification"].lower() 57 | 58 | experiment_results_workflow = client.evaluate( 59 | # Run agent 60 | target_email_assistant, 61 | # Dataset name 62 | data=dataset_name, 63 | # Evaluator 64 | evaluators=[ 65 | classification_evaluator 66 | ], 67 | # Name of the experiment 68 | experiment_prefix="E-mail assistant workflow", 69 | # Number of concurrent evaluations 70 | max_concurrency=2, 71 | ) 72 | 73 | ## Add visualization 74 | # Convert evaluation results to pandas dataframes 75 | df_workflow = experiment_results_workflow.to_pandas() 76 | 77 | # Calculate mean scores (values are on a 0-1 scale) 78 | workflow_score = df_workflow[f'feedback.classification_evaluator'].mean() if f'feedback.classification_evaluator' in df_workflow.columns else 0.0 79 | 80 | # Create a bar plot comparing the two models 81 | plt.figure(figsize=(10, 6)) 82 | models = ['Agentic Workflow'] 83 | scores = [workflow_score] 84 | 85 | # Create bars with distinct colors 86 | plt.bar(models, scores, color=['#5DA5DA', '#FAA43A'], 
width=0.5) 87 | 88 | # Add labels and title 89 | plt.xlabel('Agent Type') 90 | plt.ylabel('Average Score') 91 | plt.title(f'Email Triage Performance Comparison - {feedback_key.capitalize()} Score') 92 | 93 | # Add score values on top of bars 94 | for i, score in enumerate(scores): 95 | plt.text(i, score + 0.02, f'{score:.2f}', ha='center', fontweight='bold') 96 | 97 | # Set y-axis limit 98 | plt.ylim(0, 1.1) 99 | 100 | # Add grid lines for better readability 101 | plt.grid(axis='y', linestyle='--', alpha=0.7) 102 | 103 | # Ensure the output directory exists 104 | os.makedirs('eval/results', exist_ok=True) 105 | 106 | # Save the plot with timestamp 107 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 108 | plot_path = f'eval/results/triage_comparison_{timestamp}.png' 109 | plt.savefig(plot_path) 110 | plt.close() 111 | 112 | print(f"\nEvaluation visualization saved to: {plot_path}") 113 | print(f"Agent With Router Score: {workflow_score:.2f}") 114 | 115 | -------------------------------------------------------------------------------- /eval/prompts.py: -------------------------------------------------------------------------------- 1 | # Used in /eval/evaluate_triage.py 2 | TRIAGE_CLASSIFICATION_PROMPT = """ 3 | 4 | 5 | You are evaluating the classification of emails. 6 | 7 | They should be classified into one of the following categories: 8 | - ignore 9 | - notify 10 | - respond 11 | 12 | You will be given: 13 | - the email_input 14 | - the agent's reasoning and decision as a list of messages 15 | - the correct classification 16 | 17 | Your job is to evaluate the agent's reasoning and decision relative to the correct classification. 18 | 19 | 20 | 21 | {inputs} 22 | 23 | 24 | 25 | {outputs} 26 | 27 | 28 | 29 | {reference_outputs} 30 | 31 | """ 32 | 33 | # Used in /tests/test_email_assistant.py 34 | RESPONSE_CRITERIA_SYSTEM_PROMPT = """You are evaluating an email assistant that works on behalf of a user, Lance Martin.
35 | 36 | You will see a sequence of messages, starting with an email sent to Lance Martin . 37 | 38 | You will then see the assistant's response to this email on behalf of Lance Martin, which includes any tool calls made (e.g., write_email, schedule_meeting, check_calendar_availability, done). 39 | 40 | You will also see a list of criteria that the assistant's response must meet. 41 | 42 | Your job is to evaluate if the assistant's response meets ALL the criteria bullet points provided. 43 | 44 | IMPORTANT EVALUATION INSTRUCTIONS: 45 | 1. The assistant's response is formatted as a list of messages. 46 | 2. The response criteria are formatted as bullet points (•) 47 | 3. You must evaluate the response against EACH bullet point individually 48 | 4. ALL bullet points must be met for the response to receive a 'True' grade 49 | 5. For each bullet point, cite specific text from the response that satisfies or fails to satisfy it 50 | 6. Be objective and rigorous in your evaluation 51 | 7. In your justification, clearly indicate which criteria were met and which were not 52 | 8. If ANY criteria are not met, the overall grade must be 'False' 53 | 54 | Your output will be used for automated testing, so maintain a consistent evaluation approach.""" 55 | 56 | # Used in /tests/test_hitl.py 57 | HITL_FEEDBACK_SYSTEM_PROMPT = """You are evaluating an email assistant's response to determine if it meets specific criteria. 58 | 59 | This is an email assistant that is used to respond to emails. Review our initial email response and the user feedback given to update the email response. Here is the feedback: {feedback}. Assess whether the final email response addresses the feedback that we gave.""" 60 | 61 | # Used in /tests/test_memory.py 62 | MEMORY_UPDATE_SYSTEM_PROMPT = """This is an email assistant that uses memory to update its response preferences. 63 | 64 | Review the initial response preferences and the updated response preferences.
Assess whether the updated response preferences are more accurate than the initial response preferences.""" -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerfile_lines": [], 3 | "graphs": { 4 | "langgraph101": "./src/email_assistant/langgraph_101.py:app", 5 | "email_assistant": "./src/email_assistant/email_assistant.py:email_assistant", 6 | "email_assistant_hitl": "./src/email_assistant/email_assistant_hitl.py:email_assistant", 7 | "email_assistant_hitl_memory": "./src/email_assistant/email_assistant_hitl_memory.py:email_assistant", 8 | "email_assistant_hitl_memory_gmail": "./src/email_assistant/email_assistant_hitl_memory_gmail.py:email_assistant", 9 | "cron": "./src/email_assistant/cron.py:graph" 10 | }, 11 | "python_version": "3.11", 12 | "env": ".env", 13 | "dependencies": [ 14 | "." 15 | ] 16 | } -------------------------------------------------------------------------------- /notebooks/agent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7bb66df4", 6 | "metadata": {}, 7 | "source": [ 8 | "# Building Agents \n", 9 | " \n", 10 | "> Note: Optionally, see [these slides](https://docs.google.com/presentation/d/13c0L1CQWAL7fuCXakOqjkvoodfynPJI4Hw_4H76okVU/edit?usp=sharing) and [langgraph_101.ipynb](langgraph_101.ipynb) for context before diving into this notebook!\n", 11 | "\n", 12 | "We're going to build an email assistant from scratch, starting here with 1) the agent architecture (using [LangGraph](https://langchain-ai.github.io/langgraph/)) and following with 2) testing (using [LangSmith](https://docs.smith.langchain.com/)), 3) human-in-the-loop, and 4) memory. 
This diagram show how these pieces will fit together:\n", 13 | "\n", 14 | "![overview-img](img/overview.png)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "19d34429", 20 | "metadata": {}, 21 | "source": [ 22 | "#### Load environment variables" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "46c9f78e", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from dotenv import load_dotenv\n", 33 | "load_dotenv(\"../.env\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "54a69e9a", 39 | "metadata": {}, 40 | "source": [ 41 | "## Tool Definition\n", 42 | "\n", 43 | "Let's start by defining some simple tools that an email assistant will use with the `@tool` decorator:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "f2b708ec", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from typing import Literal\n", 54 | "from datetime import datetime\n", 55 | "from pydantic import BaseModel\n", 56 | "from langchain_core.tools import tool\n", 57 | "\n", 58 | "@tool\n", 59 | "def write_email(to: str, subject: str, content: str) -> str:\n", 60 | " \"\"\"Write and send an email.\"\"\"\n", 61 | " # Placeholder response - in real app would send email\n", 62 | " return f\"Email sent to {to} with subject '{subject}' and content: {content}\"\n", 63 | "\n", 64 | "@tool\n", 65 | "def schedule_meeting(\n", 66 | " attendees: list[str], subject: str, duration_minutes: int, preferred_day: datetime, start_time: int\n", 67 | ") -> str:\n", 68 | " \"\"\"Schedule a calendar meeting.\"\"\"\n", 69 | " # Placeholder response - in real app would check calendar and schedule\n", 70 | " date_str = preferred_day.strftime(\"%A, %B %d, %Y\")\n", 71 | " return f\"Meeting '{subject}' scheduled on {date_str} at {start_time} for {duration_minutes} minutes with {len(attendees)} attendees\"\n", 72 | "\n", 73 | "@tool\n", 74 | "def check_calendar_availability(day: str) -> 
str:\n", 75 | " \"\"\"Check calendar availability for a given day.\"\"\"\n", 76 | " # Placeholder response - in real app would check actual calendar\n", 77 | " return f\"Available times on {day}: 9:00 AM, 2:00 PM, 4:00 PM\"\n", 78 | "\n", 79 | "@tool\n", 80 | "class Done(BaseModel):\n", 81 | " \"\"\"E-mail has been sent.\"\"\"\n", 82 | " done: bool" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "2911c929-5c41-4dcd-9cc8-21a8ff82b769", 88 | "metadata": {}, 89 | "source": [ 90 | "## Building our email assistant\n", 91 | "\n", 92 | "We'll combine a [router and agent](https://langchain-ai.github.io/langgraph/tutorials/workflows/) to build our email assistant.\n", 93 | "\n", 94 | "![agent_workflow_img](img/email_workflow.png)\n", 95 | "\n", 96 | "### Router\n", 97 | "\n", 98 | "The routing step handles the triage decision. \n", 99 | "\n", 100 | "The triage router only focuses on the triage decision, while the agent focuses *only* on the response. \n", 101 | "\n", 102 | "#### State\n", 103 | "\n", 104 | "When building an agent, it's important to consider the information that you want to track over time. We'll use LangGraph's pre-built [`MessagesState` object](https://langchain-ai.github.io/langgraph/concepts/low_level/#messagesstate), which is a just dictionary with a `messages` key that appends messages returned by nodes [as its update logic](https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers). However, LangGraph gives you flexibility to track other information. 
We'll define a custom `State` object that extends `MessagesState` and adds a `classification_decision` key:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "692537ec-f09e-4086-81e4-9c517273b854", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from langgraph.graph import MessagesState\n", 115 | "\n", 116 | "class State(MessagesState):\n", 117 | " # We can add a specific key to our state for the email input\n", 118 | " email_input: dict\n", 119 | " classification_decision: Literal[\"ignore\", \"respond\", \"notify\"]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "d6cd1647-6d58-4aae-b954-6a9c5790c20c", 125 | "metadata": {}, 126 | "source": [ 127 | "#### Triage node\n", 128 | "\n", 129 | "We define a python function with our triage routing logic.\n", 130 | "\n", 131 | "> **Note:** here we change to the parent directory (`%cd ..`) to access our project's module structure, which contains reusable prompts and components. The autoreload extensions ensure any changes to these modules are automatically reflected in the notebook without requiring kernel restarts. This allows us to organize our prompts in a dedicated module rather than defining them inline, making them easier to maintain and reuse across the notebooks! You can see all these files in: `src/email_assistant`\n", 132 | "\n", 133 | "For this, we use [structured outputs](https://python.langchain.com/docs/concepts/structured_outputs/) with a Pydantic model, which is particularly useful for defining structured output schemas because it offers type hints and validation. The descriptions in the pydantic model are important because they get passed as part JSON schema to the LLM to inform the output coercion." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "8adf520b-adf5-4a7b-b7a8-b8c23720c03f", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "%cd ..\n", 144 | "%load_ext autoreload\n", 145 | "%autoreload 2\n", 146 | "\n", 147 | "from pydantic import BaseModel, Field\n", 148 | "from email_assistant.utils import parse_email, format_email_markdown\n", 149 | "from email_assistant.prompts import triage_system_prompt, triage_user_prompt, default_triage_instructions, default_background\n", 150 | "from langchain.chat_models import init_chat_model\n", 151 | "from langgraph.graph import END\n", 152 | "from langgraph.types import Command" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "2c2c2ff0-da93-4731-b5b6-0ccd59e0e783", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "triage_system_prompt" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "f3a1ad2c-40a2-42d0-a4b8-7a25df825fad", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "triage_user_prompt" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "69b0df31-b9d2-423f-ba07-67eb0643c2ba", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "default_background" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "4b3ea767-6ac1-4562-8ca6-5fa451495786", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "default_triage_instructions" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "c54ae6a6-94d9-4160-8d45-18f4d29aa600", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "class RouterSchema(BaseModel):\n", 203 | " \"\"\"Analyze the unread email and route it according to its content.\"\"\"\n", 204 | "\n", 205 | " reasoning: str = Field(\n", 206 | " 
description=\"Step-by-step reasoning behind the classification.\"\n", 207 | " )\n", 208 | " classification: Literal[\"ignore\", \"respond\", \"notify\"] = Field(\n", 209 | " description=\"The classification of an email: 'ignore' for irrelevant emails, \"\n", 210 | " \"'notify' for important information that doesn't need a response, \"\n", 211 | " \"'respond' for emails that need a reply\",\n", 212 | " )\n", 213 | "\n", 214 | "# Initialize the LLM for use with router / structured output\n", 215 | "llm = init_chat_model(\"openai:gpt-4.1\", temperature=0.0)\n", 216 | "llm_router = llm.with_structured_output(RouterSchema) \n", 217 | "\n", 218 | "def triage_router(state: State) -> Command[Literal[\"response_agent\", \"__end__\"]]:\n", 219 | " \"\"\"Analyze email content to decide if we should respond, notify, or ignore.\"\"\"\n", 220 | " \n", 221 | " author, to, subject, email_thread = parse_email(state[\"email_input\"])\n", 222 | " system_prompt = triage_system_prompt.format(\n", 223 | " background=default_background,\n", 224 | " triage_instructions=default_triage_instructions\n", 225 | " )\n", 226 | "\n", 227 | " user_prompt = triage_user_prompt.format(\n", 228 | " author=author, to=to, subject=subject, email_thread=email_thread\n", 229 | " )\n", 230 | "\n", 231 | " result = llm_router.invoke(\n", 232 | " [\n", 233 | " {\"role\": \"system\", \"content\": system_prompt},\n", 234 | " {\"role\": \"user\", \"content\": user_prompt},\n", 235 | " ]\n", 236 | " )\n", 237 | " \n", 238 | " if result.classification == \"respond\":\n", 239 | " print(\"📧 Classification: RESPOND - This email requires a response\")\n", 240 | " goto = \"response_agent\"\n", 241 | " update = {\n", 242 | " \"messages\": [\n", 243 | " {\n", 244 | " \"role\": \"user\",\n", 245 | " \"content\": f\"Respond to the email: \\n\\n{format_email_markdown(subject, author, to, email_thread)}\",\n", 246 | " }\n", 247 | " ],\n", 248 | " \"classification_decision\": result.classification,\n", 249 | " }\n", 250 | " 
\n", 251 | " elif result.classification == \"ignore\":\n", 252 | " print(\"🚫 Classification: IGNORE - This email can be safely ignored\")\n", 253 | " goto = END\n", 254 | " update = {\n", 255 | " \"classification_decision\": result.classification,\n", 256 | " }\n", 257 | " \n", 258 | " elif result.classification == \"notify\":\n", 259 | " print(\"🔔 Classification: NOTIFY - This email contains important information\")\n", 260 | " # For now, we go to END. But we will add to this later!\n", 261 | " goto = END\n", 262 | " update = {\n", 263 | " \"classification_decision\": result.classification,\n", 264 | " }\n", 265 | " \n", 266 | " else:\n", 267 | " raise ValueError(f\"Invalid classification: {result.classification}\")\n", 268 | " return Command(goto=goto, update=update)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "272d8715", 274 | "metadata": {}, 275 | "source": [ 276 | "We use [Command](https://langchain-ai.github.io/langgraph/how-tos/command/) objects in LangGraph to both update the state and select the next node to visit. This is a useful alternative to edges.\n", 277 | "\n", 278 | "### Agent\n", 279 | "\n", 280 | "Now, let's build the agent.\n", 281 | "\n", 282 | "#### LLM node\n", 283 | "\n", 284 | "Here, we define the LLM decision-making node. This node takes in the current state, calls the LLM, and updates `messages` with the LLM output. \n", 285 | "\n", 286 | "We [enforce tool use with OpenAI](https://python.langchain.com/docs/how_to/tool_choice/) by setting `tool_choice=\"required\"`." 
287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "1e842b3c-06f5-440f-8159-995503ef3a99", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "from src.email_assistant.tools.default.prompt_templates import AGENT_TOOLS_PROMPT\n", 297 | "from src.email_assistant.prompts import agent_system_prompt, default_response_preferences, default_cal_preferences" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "8f69c6fc-70aa-48f1-8312-2b1818469a1b", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "AGENT_TOOLS_PROMPT" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "9052fced-3fdb-4cd2-ac88-e2ccdce14e7c", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "agent_system_prompt" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "6f2c120f", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "# Collect all tools\n", 328 | "tools = [write_email, schedule_meeting, check_calendar_availability, Done]\n", 329 | "tools_by_name = {tool.name: tool for tool in tools}\n", 330 | "\n", 331 | "# Initialize the LLM, enforcing tool use\n", 332 | "llm = init_chat_model(\"openai:gpt-4.1\", temperature=0.0)\n", 333 | "llm_with_tools = llm.bind_tools(tools, tool_choice=\"any\")\n", 334 | "\n", 335 | "def llm_call(state: State):\n", 336 | " \"\"\"LLM decides whether to call a tool or not\"\"\"\n", 337 | "\n", 338 | " return {\n", 339 | " \"messages\": [\n", 340 | " # Invoke the LLM\n", 341 | " llm_with_tools.invoke(\n", 342 | " # Add the system prompt\n", 343 | " [ \n", 344 | " {\"role\": \"system\", \"content\": agent_system_prompt.format(\n", 345 | " tools_prompt=AGENT_TOOLS_PROMPT,\n", 346 | " background=default_background,\n", 347 | " response_preferences=default_response_preferences,\n", 348 | " cal_preferences=default_cal_preferences, \n", 349 | " 
)}\n", 350 | " ]\n", 351 | " # Add the current messages to the prompt\n", 352 | " + state[\"messages\"]\n", 353 | " )\n", 354 | " ]\n", 355 | " }" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "id": "9f05d11a", 361 | "metadata": {}, 362 | "source": [ 363 | "#### Tool handler node\n", 364 | "\n", 365 | "After the LLM makes a decision, we need to execute the chosen tool. \n", 366 | "\n", 367 | "The `tool_handler` node executes the tool. We can see that nodes can update the graph state to capture any important state changes, such as the classification decision." 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "id": "43eb6dc2", 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "def tool_handler(state: State):\n", 378 | " \"\"\"Performs the tool call.\"\"\"\n", 379 | "\n", 380 | " # List for tool messages\n", 381 | " result = []\n", 382 | " \n", 383 | " # Iterate through tool calls\n", 384 | " for tool_call in state[\"messages\"][-1].tool_calls:\n", 385 | " # Get the tool\n", 386 | " tool = tools_by_name[tool_call[\"name\"]]\n", 387 | " # Run it\n", 388 | " observation = tool.invoke(tool_call[\"args\"])\n", 389 | " # Create a tool message\n", 390 | " result.append({\"role\": \"tool\", \"content\" : observation, \"tool_call_id\": tool_call[\"id\"]})\n", 391 | " \n", 392 | " # Add it to our messages\n", 393 | " return {\"messages\": result}" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "id": "4721dede", 399 | "metadata": {}, 400 | "source": [ 401 | "#### Conditional Routing\n", 402 | "\n", 403 | "Our agent needs to decide when to continue using tools and when to stop. This conditional routing function directs the agent to either continue or terminate." 
404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "id": "7c7cbea7", 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "def should_continue(state: State) -> Literal[\"tool_handler\", \"__end__\"]:\n", 414 | " \"\"\"Route to tool handler, or end if Done tool called.\"\"\"\n", 415 | " \n", 416 | " # Get the last message\n", 417 | " messages = state[\"messages\"]\n", 418 | " last_message = messages[-1]\n", 419 | " \n", 420 | " # Check if it's a Done tool call\n", 421 | " if last_message.tool_calls:\n", 422 | " for tool_call in last_message.tool_calls: \n", 423 | " if tool_call[\"name\"] == \"Done\":\n", 424 | " return END\n", 425 | " else:\n", 426 | " return \"tool_handler\"" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "6eb4ede8", 432 | "metadata": {}, 433 | "source": [ 434 | "#### Agent Graph\n", 435 | "\n", 436 | "Finally, we can assemble all components:" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "id": "f81df767", 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "from langgraph.graph import StateGraph, START, END\n", 447 | "from email_assistant.utils import show_graph\n", 448 | "\n", 449 | "# Build workflow\n", 450 | "overall_workflow = StateGraph(State)\n", 451 | "\n", 452 | "# Add nodes\n", 453 | "overall_workflow.add_node(\"llm_call\", llm_call)\n", 454 | "overall_workflow.add_node(\"tool_handler\", tool_handler)\n", 455 | "\n", 456 | "# Add edges\n", 457 | "overall_workflow.add_edge(START, \"llm_call\")\n", 458 | "overall_workflow.add_conditional_edges(\n", 459 | " \"llm_call\",\n", 460 | " should_continue,\n", 461 | " {\n", 462 | " \"tool_handler\": \"tool_handler\",\n", 463 | " END: END,\n", 464 | " },\n", 465 | ")\n", 466 | "overall_workflow.add_edge(\"tool_handler\", \"llm_call\")\n", 467 | "\n", 468 | "# Compile the agent\n", 469 | "agent = overall_workflow.compile()" 470 | ] 471 | }, 472 | { 473 | "cell_type": 
"code", 474 | "execution_count": null, 475 | "id": "617f6373-bf48-44c2-ba33-000c9f22b067", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# View\n", 480 | "show_graph(agent)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "id": "dc8367c4", 486 | "metadata": {}, 487 | "source": [ 488 | "This creates a graph that:\n", 489 | "1. Starts with an LLM decision\n", 490 | "2. Conditionally routes to tool execution or termination\n", 491 | "3. After tool execution, returns to LLM for the next decision\n", 492 | "4. Repeats until completion or no tool is called\n" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "id": "b2b3406d-496d-43c9-942e-c5ce7e3a8321", 498 | "metadata": {}, 499 | "source": [ 500 | "### Combine workflow with our agent\n", 501 | "\n", 502 | "We can combine the router and the agent." 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "697f2548-b5a5-4fb6-8aed-226369e53e25", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "overall_workflow = (\n", 513 | " StateGraph(State)\n", 514 | " .add_node(triage_router)\n", 515 | " .add_node(\"response_agent\", agent)\n", 516 | " .add_edge(START, \"triage_router\")\n", 517 | ").compile()" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "id": "2dd6dcc4-6346-4d41-ae36-61f3fc83b7a7", 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "show_graph(overall_workflow, xray=True)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "id": "2091d5cc", 533 | "metadata": {}, 534 | "source": [ 535 | "This is a higher-level composition where:\n", 536 | "1. First, the triage router analyzes the email\n", 537 | "2. If needed, the response agent handles crafting a response\n", 538 | "3. 
The workflow ends when either the triage decides no response is needed or the response agent completes" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "id": "070f18a6", 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "email_input = {\n", 549 | " \"author\": \"System Admin \",\n", 550 | " \"to\": \"Development Team \",\n", 551 | " \"subject\": \"Scheduled maintenance - database downtime\",\n", 552 | " \"email_thread\": \"Hi team,\\n\\nThis is a reminder that we'll be performing scheduled maintenance on the production database tonight from 2AM to 4AM EST. During this time, all database services will be unavailable.\\n\\nPlease plan your work accordingly and ensure no critical deployments are scheduled during this window.\\n\\nThanks,\\nSystem Admin Team\"\n", 553 | "}\n", 554 | "\n", 555 | "# Run the agent\n", 556 | "response = overall_workflow.invoke({\"email_input\": email_input})\n", 557 | "for m in response[\"messages\"]:\n", 558 | " m.pretty_print()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "id": "7a50ae0a-7bd1-4e69-90be-781b1e77b4dd", 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "email_input = {\n", 569 | " \"author\": \"Alice Smith \",\n", 570 | " \"to\": \"John Doe \",\n", 571 | " \"subject\": \"Quick question about API documentation\",\n", 572 | " \"email_thread\": \"Hi John,\\nI was reviewing the API documentation for the new authentication service and noticed a few endpoints seem to be missing from the specs. 
Could you help clarify if this was intentional or if we should update the docs?\\nSpecifically, I'm looking at:\\n- /auth/refresh\\n- /auth/validate\\nThanks!\\nAlice\"\n", 573 | "}\n", 574 | "\n", 575 | "# Run the agent\n", 576 | "response = overall_workflow.invoke({\"email_input\": email_input})\n", 577 | "for m in response[\"messages\"]:\n", 578 | " m.pretty_print()" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "id": "f631f61f", 584 | "metadata": {}, 585 | "source": [ 586 | "## Testing with Local Deployment\n", 587 | "\n", 588 | "You can find the file for our agent in the `src/email_assistant` directory:\n", 589 | "\n", 590 | "* `src/email_assistant/email_assistant.py`\n", 591 | "\n", 592 | "You can test them locally in LangGraph Studio by running:\n", 593 | "\n", 594 | "```\n", 595 | "! langgraph dev\n", 596 | "```" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "id": "12752016", 602 | "metadata": { 603 | "lines_to_next_cell": 0 604 | }, 605 | "source": [ 606 | "Example e-mail you can test:" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "id": "08ee005a", 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "{\n", 617 | " \"author\": \"Alice Smith \",\n", 618 | " \"to\": \"John Doe \",\n", 619 | " \"subject\": \"Quick question about API documentation\",\n", 620 | " \"email_thread\": \"Hi John,\\nI was reviewing the API documentation for the new authentication service and noticed a few endpoints seem to be missing from the specs. 
Could you help clarify if this was intentional or if we should update the docs?\\nSpecifically, I'm looking at:\\n- /auth/refresh\\n- /auth/validate\\nThanks!\\nAlice\"\n", 621 | "}" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "id": "d09e33b6", 627 | "metadata": {}, 628 | "source": [ 629 | "![studio-img](img/studio.png)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "id": "a3da05d3-78d1-40bf-b683-bcde7f76b0b3", 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "id": "0d195e21-f2c5-4762-a4f0-c8d7459df6d5", 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [] 647 | } 648 | ], 649 | "metadata": { 650 | "jupytext": { 651 | "cell_metadata_filter": "-all", 652 | "main_language": "python", 653 | "notebook_metadata_filter": "-all" 654 | }, 655 | "kernelspec": { 656 | "display_name": "Python 3 (ipykernel)", 657 | "language": "python", 658 | "name": "python3" 659 | }, 660 | "language_info": { 661 | "codemirror_mode": { 662 | "name": "ipython", 663 | "version": 3 664 | }, 665 | "file_extension": ".py", 666 | "mimetype": "text/x-python", 667 | "name": "python", 668 | "nbconvert_exporter": "python", 669 | "pygments_lexer": "ipython3", 670 | "version": "3.11.6" 671 | } 672 | }, 673 | "nbformat": 4, 674 | "nbformat_minor": 5 675 | } 676 | -------------------------------------------------------------------------------- /notebooks/evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "68e21aa1", 6 | "metadata": {}, 7 | "source": [ 8 | "# Evaluating Agents\n", 9 | "\n", 10 | "We have an email assistant that uses a router to triage emails and then passes the email to the agent for response generation. How can we be sure that it will work well in production? 
This is why testing is important: it guides our decisions about our agent architecture with quantifiable metrics like response quality, token usage, latency, or triage accuracy. [LangSmith](https://docs.smith.langchain.com/) offers two primary ways to test agents. \n", 11 | "\n", 12 | "![overview-img](img/overview_eval.png)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "4d7f7048", 18 | "metadata": {}, 19 | "source": [ 20 | "#### Load Environment Variables" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "c47d4c3d", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from dotenv import load_dotenv\n", 31 | "load_dotenv(\"../.env\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "2005c34d", 37 | "metadata": {}, 38 | "source": [ 39 | "## How to run Evaluations\n", 40 | "\n", 41 | "#### Pytest / Vitest\n", 42 | "\n", 43 | "[Pytest](https://docs.pytest.org/en/stable/) and Vitest are well known to many developers as powerful tools for writing tests within the Python and JavaScript ecosystems. LangSmith integrates with these frameworks to allow you to write and run tests that log results to LangSmith. For this notebook, we'll use Pytest.\n", 44 | "* Pytest is a great way to get started for developers who are already familiar with their framework. \n", 45 | "* Pytest is great for more complex evaluations, where each agent test case requires specific checks and success criteria that are harder to generalize.\n", 46 | "\n", 47 | "#### LangSmith Datasets \n", 48 | "\n", 49 | "You can also create a dataset [in LangSmith](https://docs.smith.langchain.com/evaluation) and run our assistant against the dataset using the LangSmith evaluate API.\n", 50 | "* LangSmith datasets are great for teams who are collaboratively building out their test suite. 
\n", 51 | "* You can leverage production traces, annotation queues, synthetic data generation, and more, to add examples to an ever-growing golden dataset.\n", 52 | "* LangSmith datasets are great when you can define evaluators that can be applied to every test case in the dataset (ex. similarity, exact match accuracy, etc.)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "10b7c989", 58 | "metadata": {}, 59 | "source": [ 60 | "## Test Cases\n", 61 | "\n", 62 | "Testing often starts with defining the test cases, which can be a challenging process. In this case, we'll just define a set of example emails we want to handle along with a few things to test. You can see the test cases in `eval/email_dataset.py`, which contains the following:\n", 63 | "\n", 64 | "1. **Input Emails**: A collection of diverse email examples\n", 65 | "2. **Ground Truth Classifications**: `Respond`, `Notify`, `Ignore`\n", 66 | "3. **Expected Tool Calls**: Tools called for each email that requires a response\n", 67 | "4. **Response Criteria**: What makes a good response for emails requiring replies\n", 68 | "\n", 69 | "Note that we have both\n", 70 | "- End to end \"integration\" tests (e.g. Input Emails -> Agent -> Final Output vs Response Criteria)\n", 71 | "- Tests for specific steps in our workflow (e.g. 
Input Emails -> Agent -> Classification vs Ground Truth Classification)\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "f8fdc2b8", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "%cd ..\n", 82 | "%load_ext autoreload\n", 83 | "%autoreload 2\n", 84 | "\n", 85 | "from eval.email_dataset import email_inputs, expected_tool_calls, triage_outputs_list, response_criteria_list\n", 86 | "\n", 87 | "test_case_ix = 0\n", 88 | "\n", 89 | "print(\"Email Input:\", email_inputs[test_case_ix])\n", 90 | "print(\"Expected Triage Output:\", triage_outputs_list[test_case_ix])\n", 91 | "print(\"Expected Tool Calls:\", expected_tool_calls[test_case_ix])\n", 92 | "print(\"Response Criteria:\", response_criteria_list[test_case_ix])" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "2337bd7c", 98 | "metadata": {}, 99 | "source": [ 100 | "## Pytest Example\n", 101 | "\n", 102 | "Let's take a look at how we can write a test for a specific part of our workflow with Pytest. We will test whether our `email_assistant` makes the right tool calls when responding to the emails." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "ae92fe30", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "import pytest\n", 113 | "from eval.email_dataset import email_inputs, expected_tool_calls\n", 114 | "from email_assistant.utils import format_messages_string\n", 115 | "from email_assistant.email_assistant import email_assistant\n", 116 | "from email_assistant.utils import extract_tool_calls\n", 117 | "\n", 118 | "from langsmith import testing as t\n", 119 | "from dotenv import load_dotenv\n", 120 | "\n", 121 | "load_dotenv(\".env\", override=True)\n", 122 | "\n", 123 | "@pytest.mark.langsmith\n", 124 | "@pytest.mark.parametrize(\n", 125 | " \"email_input, expected_calls\",\n", 126 | " [ # Pick some examples with e-mail reply expected\n", 127 | " (email_inputs[0],expected_tool_calls[0]),\n", 128 | " (email_inputs[3],expected_tool_calls[3]),\n", 129 | " ],\n", 130 | ")\n", 131 | "def test_email_dataset_tool_calls(email_input, expected_calls):\n", 132 | " \"\"\"Test if email processing contains expected tool calls.\"\"\"\n", 133 | " # Run the email assistant\n", 134 | " messages = [{\"role\": \"user\", \"content\": str(email_input)}]\n", 135 | " result = email_assistant.invoke({\"messages\": messages})\n", 136 | " \n", 137 | " # Extract tool calls from messages list\n", 138 | " extracted_tool_calls = extract_tool_calls(result['messages'])\n", 139 | " \n", 140 | " # Check if all expected tool calls are in the extracted ones\n", 141 | " missing_calls = [call for call in expected_calls if call.lower() not in extracted_tool_calls]\n", 142 | " \n", 143 | " t.log_outputs({\n", 144 | " \"missing_calls\": missing_calls,\n", 145 | " \"extracted_tool_calls\": extracted_tool_calls,\n", 146 | " \"response\": format_messages_string(result['messages'])\n", 147 | " })\n", 148 | "\n", 149 | " # Test passes if no expected calls are missing\n", 150 | " assert len(missing_calls) == 0" 151 | ] 152 | }, 153 | { 154 
| "cell_type": "markdown", 155 | "id": "700aba2a", 156 | "metadata": {}, 157 | "source": [ 158 | "You'll notice a few things. \n", 159 | "- To [run with Pytest and log test results to LangSmith](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest), we only need to add the `@pytest.mark.langsmith ` decorator to our function and place it in a file, as you see in `notebooks/test_tools.py`. This will log the test results to LangSmith.\n", 160 | "- Second, we can pass dataset examples to the test function as shown [here](https://docs.smith.langchain.com/evaluation/how_to_guides/pytest#parametrize-with-pytestmarkparametrize) via `@pytest.mark.parametrize`. \n", 161 | "\n", 162 | "#### Running Pytest\n", 163 | "We can run the test from the command line. We've defined the above code in a python file. From the project root, run:\n", 164 | "\n", 165 | "`! LANGSMITH_TEST_SUITE='Email assistant: Test Tools For Interrupt' pytest notebooks/test_tools.py`" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "53165e98", 171 | "metadata": {}, 172 | "source": [ 173 | "#### Viewing Experiment Result\n", 174 | "\n", 175 | "We can view the results in the LangSmith UI. The `assert len(missing_calls) == 0` is logged to the `Pass` column in LangSmith. The `log_outputs` are passed to the `Outputs` column and function arguments are passed to the `Inputs` column. Each input passed in `@pytest.mark.parametrize(` is a separate row logged to the `LANGSMITH_TEST_SUITE` project name in LangSmith, which is found under `Datasets & Experiments`.\n", 176 | "\n", 177 | "![Test Results](img/test_result.png)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "fd325e27", 183 | "metadata": {}, 184 | "source": [ 185 | "## LangSmith Datasets Example\n", 186 | "\n", 187 | "![overview-img](img/eval_detail.png)\n", 188 | "\n", 189 | "Let's take a look at how we can run evaluations with LangSmith datasets. 
In the previous example with Pytest, we evaluated the tool calling accuracy of the email assistant. Now, the dataset that we're going to evaluate here is specifically for the triage step of the email assistant, in classifying whether an email requires a response.\n", 190 | "\n", 191 | "#### Dataset Definition \n", 192 | "\n", 193 | "We can [create a dataset in LangSmith](https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically#create-a-dataset) with the LangSmith SDK. The below code creates a dataset with the test cases in the `eval/email_dataset.py` file." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "7ea997ac", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from langsmith import Client\n", 204 | "import matplotlib.pyplot as plt\n", 205 | "\n", 206 | "from eval.email_dataset import examples_triage\n", 207 | "\n", 208 | "# Initialize LangSmith client\n", 209 | "client = Client()\n", 210 | "\n", 211 | "# Dataset name\n", 212 | "dataset_name = \"Interrupt Workshop: E-mail Triage Dataset\"\n", 213 | "\n", 214 | "# Create dataset if it doesn't exist\n", 215 | "if not client.has_dataset(dataset_name=dataset_name):\n", 216 | " dataset = client.create_dataset(\n", 217 | " dataset_name=dataset_name, \n", 218 | " description=\"A dataset of e-mails and their triage decisions.\"\n", 219 | " )\n", 220 | " # Add examples to the dataset\n", 221 | " client.create_examples(dataset_id=dataset.id, examples=examples_triage)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "0b2df606", 227 | "metadata": {}, 228 | "source": [ 229 | "#### Target Function\n", 230 | "\n", 231 | "The dataset has the following structure, with an e-mail input and a ground truth triage classification for the e-mail as output:\n", 232 | "\n", 233 | "```\n", 234 | "examples_triage = [\n", 235 | " {\n", 236 | " \"inputs\": {\"email_input\": email_input_1},\n", 237 | " \"outputs\": 
{\"classification\": triage_output_1}, # NOTE: This becomes the reference_output in the created dataset\n", 238 | " }, ...\n", 239 | "]\n", 240 | "```" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "f7d7e83f-3006-4386-9230-786545c7b1a1", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "print(\"Dataset Example Input (inputs):\", examples_triage[0]['inputs'])" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "f292f070-7af6-4370-9338-e90bfd6b3d42", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "print(\"Dataset Example Reference Output (reference_outputs):\", examples_triage[0]['outputs'])" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "id": "8290e820", 266 | "metadata": {}, 267 | "source": [ 268 | "We define a function that takes the dataset inputs and passes them to our email assistant. The LangSmith [evaluate API](https://docs.smith.langchain.com/evaluation) passes the `inputs` dict to this function. This function then returns a dict with the agent's output. Because we are evaluating the triage step, we only need to return the classification decision. " 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "0b9d1ded", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "def target_email_assistant(inputs: dict) -> dict:\n", 279 | " \"\"\"Process an email through the workflow-based email assistant.\"\"\"\n", 280 | " response = email_assistant.invoke({\"email_input\": inputs[\"email_input\"]})\n", 281 | " return {\"classification_decision\": response['classification_decision']}" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "id": "5ba6ec4c", 287 | "metadata": {}, 288 | "source": [ 289 | "#### Evaluator Function \n", 290 | "\n", 291 | "Now, we create an evaluator function. What do we want to evaluate? 
We have reference outputs in our dataset and agent outputs defined in the functions above.\n", 292 | "\n", 293 | "* Reference outputs: `\"reference_outputs\": {\"classification\": triage_output_1} ...`\n", 294 | "* Agent outputs: `\"outputs\": {\"classification_decision\": agent_output_1} ...`\n", 295 | "\n", 296 | "We want to evaluate if the agent's output matches the reference output. So we simply need an evaluator function that compares the two, where `outputs` is the agent's output and `reference_outputs` is the reference output from the dataset." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "4fee7532", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "def classification_evaluator(outputs: dict, reference_outputs: dict) -> bool:\n", 307 | " \"\"\"Check if the answer exactly matches the expected answer.\"\"\"\n", 308 | " return outputs[\"classification_decision\"].lower() == reference_outputs[\"classification\"].lower()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "id": "50fd2de9", 314 | "metadata": {}, 315 | "source": [ 316 | "### Running Evaluation\n", 317 | "\n", 318 | "Now, the question is: how are these things hooked together? The evaluate API takes care of it for us. It passes the `inputs` dict from our dataset to the target function. It passes the `reference_outputs` dict from our dataset to the evaluator function. And it passes the `outputs` of our agent to the evaluator function. \n", 319 | "\n", 320 | "Note this is similar to what we did with Pytest: in Pytest, we passed in the dataset example inputs and reference outputs to the test function with `@pytest.mark.parametrize`."
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "6807306d", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# Set to true if you want to kick off evaluation\n", 331 | "run_expt = False\n", 332 | "if run_expt:\n", 333 | " experiment_results_workflow = client.evaluate(\n", 334 | " # Run agent \n", 335 | " target_email_assistant,\n", 336 | " # Dataset name \n", 337 | " data=dataset_name,\n", 338 | " # Evaluator\n", 339 | " evaluators=[classification_evaluator],\n", 340 | " # Name of the experiment\n", 341 | " experiment_prefix=\"E-mail assistant workflow\", \n", 342 | " # Number of concurrent evaluations\n", 343 | " max_concurrency=2, \n", 344 | " )" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "76baff88", 350 | "metadata": {}, 351 | "source": [ 352 | "We can view the results from both experiments in the LangSmith UI.\n", 353 | "\n", 354 | "![Test Results](img/eval.png)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "id": "c5146b52", 360 | "metadata": {}, 361 | "source": [ 362 | "## LLM-as-Judge Evaluation\n", 363 | "\n", 364 | "We've shown unit tests for the triage step (using evaluate()) and tool calling (using Pytest). \n", 365 | "\n", 366 | "We'll showcase how you could use an LLM as a judge to evaluate our agent's execution against a set of success criteria. \n", 367 | "\n", 368 | "![types](img/eval_types.png)\n", 369 | "\n", 370 | "First, we define a structured output schema for our LLM grader that contains a grade and justification for the grade." 
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "e1d342b8", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "from pydantic import BaseModel, Field\n", 381 | "from langchain.chat_models import init_chat_model\n", 382 | "\n", 383 | "class CriteriaGrade(BaseModel):\n", 384 | " \"\"\"Score the response against specific criteria.\"\"\"\n", 385 | " grade: bool = Field(description=\"Does the response meet the provided criteria?\")\n", 386 | " justification: str = Field(description=\"The justification for the grade and score, including specific examples from the response.\")\n", 387 | "\n", 388 | "# Create a global LLM for evaluation to avoid recreating it for each test\n", 389 | "criteria_eval_llm = init_chat_model(\"openai:gpt-4o\")\n", 390 | "criteria_eval_structured_llm = criteria_eval_llm.with_structured_output(CriteriaGrade)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "id": "bec02b18", 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "email_input = email_inputs[0]\n", 401 | "print(\"Email Input:\", email_input)\n", 402 | "success_criteria = response_criteria_list[0]\n", 403 | "print(\"Success Criteria:\", success_criteria)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "id": "38390ccd", 409 | "metadata": {}, 410 | "source": [ 411 | "Our Email Assistant is invoked with the email input and the response is formatted into a string. These are all then passed to the LLM grader to receive a grade and justification for the grade." 
412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "cbff28fc", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "response = email_assistant.invoke({\"email_input\": email_input})" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "d64619fb", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "from eval.prompts import RESPONSE_CRITERIA_SYSTEM_PROMPT\n", 432 | "\n", 433 | "all_messages_str = format_messages_string(response['messages'])\n", 434 | "eval_result = criteria_eval_structured_llm.invoke([\n", 435 | " {\"role\": \"system\",\n", 436 | " \"content\": RESPONSE_CRITERIA_SYSTEM_PROMPT},\n", 437 | " {\"role\": \"user\",\n", 438 | " \"content\": f\"\"\"\\n\\n Response criteria: {success_criteria} \\n\\n Assistant's response: \\n\\n {all_messages_str} \\n\\n Evaluate whether the assistant's response meets the criteria and provide justification for your evaluation.\"\"\"}\n", 439 | " ])\n", 440 | "\n", 441 | "eval_result" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "64275647-6fdb-4bf3-806b-4dbc770cbd6f", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "RESPONSE_CRITERIA_SYSTEM_PROMPT" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "id": "7994952c", 457 | "metadata": {}, 458 | "source": [ 459 | "We can see that the LLM grader returns an eval result with a schema matching our `CriteriaGrade` base model." 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "0b44111d", 465 | "metadata": {}, 466 | "source": [ 467 | "## Running against a Larger Test Suite\n", 468 | "Now that we've seen how to evaluate our agent using Pytest and evaluate(), and seen an example of using an LLM as a judge, we can use evaluations over a bigger test suite to get a better sense of how our agent performs over a wider variety of examples." 
469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "id": "9280d5ae-3070-4131-8763-454073176081", 474 | "metadata": {}, 475 | "source": [ 476 | "Let's run our email_assistant against a larger test suite.\n", 477 | "```\n", 478 | "! LANGSMITH_TEST_SUITE='Email assistant: Test Full Response Interrupt' LANGSMITH_EXPERIMENT='email_assistant' pytest tests/test_response.py --agent-module email_assistant\n", 479 | "```\n", 480 | "\n", 481 | "In `test_response.py`, you can see a few things. \n", 482 | "\n", 483 | "We pass our dataset examples into functions that will run pytest and log to our `LANGSMITH_TEST_SUITE`:\n", 484 | "\n", 485 | "```\n", 486 | "# Reference output key\n", 487 | "@pytest.mark.langsmith(output_keys=[\"criteria\"])\n", 488 | "# Variable names and a list of tuples with the test cases\n", 489 | "# Each test case is (email_input, email_name, criteria, expected_calls)\n", 490 | "@pytest.mark.parametrize(\"email_input,email_name,criteria,expected_calls\",create_response_test_cases())\n", 491 | "def test_response_criteria_evaluation(email_input, email_name, criteria, expected_calls):\n", 492 | "```\n", 493 | "\n", 494 | "We use LLM-as-judge with a grading schema:\n", 495 | "```\n", 496 | "class CriteriaGrade(BaseModel):\n", 497 | " \"\"\"Score the response against specific criteria.\"\"\"\n", 498 | " grade: bool = Field(description=\"Does the response meet the provided criteria?\")\n", 499 | " justification: str = Field(description=\"The justification for the grade and score, including specific examples from the response.\")\n", 500 | "```\n", 501 | "\n", 502 | "\n", 503 | "We evaluate the agent response relative to the criteria:\n", 504 | "```\n", 505 | " # Evaluate against criteria\n", 506 | " eval_result = criteria_eval_structured_llm.invoke([\n", 507 | " {\"role\": \"system\",\n", 508 | " \"content\": RESPONSE_CRITERIA_SYSTEM_PROMPT},\n", 509 | " {\"role\": \"user\",\n", 510 | " \"content\": f\"\"\"\\n\\n Response criteria: {criteria} \\n\\n 
Assistant's response: \\n\\n {all_messages_str} \\n\\n Evaluate whether the assistant's response meets the criteria and provide justification for your evaluation.\"\"\"}\n", 511 | " ])\n", 512 | "```" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "id": "ca836fbf", 518 | "metadata": {}, 519 | "source": [ 520 | "Now let's take a look at this experiment in the LangSmith UI and look into what our agent did well, and what it could improve on.\n", 521 | "\n", 522 | "#### Getting Results\n", 523 | "\n", 524 | "We can also get the results of the evaluation by reading the tracing project associated with our experiment. This is great for creating custom visualizations of our agent's performance." 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "id": "70b655f8", 531 | "metadata": { 532 | "lines_to_next_cell": 0 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "# TODO: Copy your experiment name here\n", 537 | "experiment_name = \"email_assistant:8286b3b8\"\n", 538 | "# Set this to load expt results\n", 539 | "load_expt = False\n", 540 | "if load_expt:\n", 541 | " email_assistant_experiment_results = client.read_project(project_name=experiment_name, include_stats=True)\n", 542 | " print(\"Latency p50:\", email_assistant_experiment_results.latency_p50)\n", 543 | " print(\"Latency p99:\", email_assistant_experiment_results.latency_p99)\n", 544 | " print(\"Token Usage:\", email_assistant_experiment_results.total_tokens)\n", 545 | " print(\"Feedback Stats:\", email_assistant_experiment_results.feedback_stats)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "id": "0ccdfaa6", 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [] 555 | } 556 | ], 557 | "metadata": { 558 | "jupytext": { 559 | "cell_metadata_filter": "-all", 560 | "main_language": "python", 561 | "notebook_metadata_filter": "-all" 562 | }, 563 | "kernelspec": { 564 | "display_name": "Python 3 (ipykernel)", 565 
| "language": "python", 566 | "name": "python3" 567 | }, 568 | "language_info": { 569 | "codemirror_mode": { 570 | "name": "ipython", 571 | "version": 3 572 | }, 573 | "file_extension": ".py", 574 | "mimetype": "text/x-python", 575 | "name": "python", 576 | "nbconvert_exporter": "python", 577 | "pygments_lexer": "ipython3", 578 | "version": "3.11.6" 579 | } 580 | }, 581 | "nbformat": 4, 582 | "nbformat_minor": 5 583 | } 584 | -------------------------------------------------------------------------------- /notebooks/img/HITL_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/HITL_flow.png -------------------------------------------------------------------------------- /notebooks/img/HITL_flow_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/HITL_flow_memory.png -------------------------------------------------------------------------------- /notebooks/img/HITL_flow_triage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/HITL_flow_triage.png -------------------------------------------------------------------------------- /notebooks/img/agent-inbox-draft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent-inbox-draft.png -------------------------------------------------------------------------------- /notebooks/img/agent-inbox-edit.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent-inbox-edit.png -------------------------------------------------------------------------------- /notebooks/img/agent-inbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent-inbox.png -------------------------------------------------------------------------------- /notebooks/img/agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent.png -------------------------------------------------------------------------------- /notebooks/img/agent_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent_example.png -------------------------------------------------------------------------------- /notebooks/img/agent_loop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent_loop.png -------------------------------------------------------------------------------- /notebooks/img/agent_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/agent_workflow.png -------------------------------------------------------------------------------- /notebooks/img/checkpoints.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/checkpoints.png -------------------------------------------------------------------------------- /notebooks/img/ecosystem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/ecosystem.png -------------------------------------------------------------------------------- /notebooks/img/email_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/email_workflow.png -------------------------------------------------------------------------------- /notebooks/img/eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/eval.png -------------------------------------------------------------------------------- /notebooks/img/eval_detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/eval_detail.png -------------------------------------------------------------------------------- /notebooks/img/eval_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/eval_types.png -------------------------------------------------------------------------------- /notebooks/img/hitl_schematic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/hitl_schematic.png -------------------------------------------------------------------------------- /notebooks/img/img_helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | from IPython.display import display, Image 4 | 5 | def show_image(filename): 6 | """ 7 | Display an image with proper path handling for both local and GitHub viewing. 8 | 9 | Args: 10 | filename: Image filename (without path) 11 | """ 12 | # The image directory relative to this file 13 | img_dir = "img" 14 | 15 | # Full path to the image 16 | img_path = os.path.join(img_dir, filename) 17 | 18 | # Display the image 19 | display(Image(img_path)) -------------------------------------------------------------------------------- /notebooks/img/langgraph_studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/langgraph_studio.png -------------------------------------------------------------------------------- /notebooks/img/memory-studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/memory-studio.png -------------------------------------------------------------------------------- /notebooks/img/nodes_edges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/nodes_edges.png -------------------------------------------------------------------------------- /notebooks/img/overview.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/overview.png -------------------------------------------------------------------------------- /notebooks/img/overview_agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/overview_agent.png -------------------------------------------------------------------------------- /notebooks/img/overview_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/overview_eval.png -------------------------------------------------------------------------------- /notebooks/img/overview_hitl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/overview_hitl.png -------------------------------------------------------------------------------- /notebooks/img/overview_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/overview_memory.png -------------------------------------------------------------------------------- /notebooks/img/router.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/router.png -------------------------------------------------------------------------------- 
/notebooks/img/short-vs-long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/short-vs-long.png -------------------------------------------------------------------------------- /notebooks/img/studio-interrupt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/studio-interrupt.png -------------------------------------------------------------------------------- /notebooks/img/studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/studio.png -------------------------------------------------------------------------------- /notebooks/img/test_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/test_result.png -------------------------------------------------------------------------------- /notebooks/img/tool_call.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/tool_call.png -------------------------------------------------------------------------------- /notebooks/img/tool_call_detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/tool_call_detail.png 
-------------------------------------------------------------------------------- /notebooks/img/workflow_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/workflow_example.png -------------------------------------------------------------------------------- /notebooks/img/workflow_v_agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/langchain-ai/agents-from-scratch/46e1e36640202bf17a3b52db905563044fa4c737/notebooks/img/workflow_v_agent.png -------------------------------------------------------------------------------- /notebooks/test_tools.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | from pathlib import Path 4 | 5 | # Add project root to Python path 6 | project_root = Path(__file__).parent.parent 7 | sys.path.append(str(project_root)) 8 | 9 | import pytest 10 | from eval.email_dataset import email_inputs, expected_tool_calls 11 | from email_assistant.utils import format_messages_string 12 | from email_assistant.email_assistant import email_assistant 13 | from email_assistant.utils import extract_tool_calls 14 | from langsmith import testing as t 15 | from dotenv import load_dotenv 16 | 17 | load_dotenv(".env") 18 | 19 | @pytest.mark.langsmith 20 | @pytest.mark.parametrize( 21 | "email_input, expected_calls", 22 | [ # Pick some examples with e-mail reply expected 23 | (email_inputs[0],expected_tool_calls[0]), 24 | (email_inputs[3],expected_tool_calls[3]), 25 | ], 26 | ) 27 | def test_email_dataset_tool_calls(email_input, expected_calls): 28 | """Test if email processing contains expected tool calls.""" 29 | # Run the email assistant 30 | result = email_assistant.invoke({"email_input": email_input}) 31 | 32 | # Extract tool calls from messages list 33 | 
extracted_tool_calls = extract_tool_calls(result['messages']) 34 | 35 | # Check if all expected tool calls are in the extracted ones 36 | missing_calls = [call for call in expected_calls if call.lower() not in extracted_tool_calls] 37 | 38 | t.log_outputs({ 39 | "missing_calls": missing_calls, 40 | "extracted_tool_calls": extracted_tool_calls, 41 | "response": format_messages_string(result['messages']) 42 | }) 43 | 44 | # Test passes if no expected calls are missing 45 | assert len(missing_calls) == 0 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "interrupt_workshop" 3 | version = "0.1.0" 4 | description = "Workshop for Interrupt Conference" 5 | requires-python = ">=3.11" 6 | dependencies = [ 7 | "langchain>=0.3.9", 8 | "langchain-core>=0.3.59", 9 | "langchain-openai", 10 | "langgraph>=0.4.2", 11 | "langsmith[pytest]>=0.3.4", 12 | "pandas", 13 | "matplotlib", 14 | "pytest", 15 | "pytest-xdist", 16 | "jupyter", 17 | "langgraph-cli[inmem]", 18 | "google-api-python-client>=2.128.0", 19 | "google-auth-oauthlib", 20 | "google-auth-httplib2", 21 | "python-dotenv", 22 | "pyppeteer", 23 | "html2text", 24 | ] 25 | 26 | [project.optional-dependencies] 27 | dev = ["mypy>=1.11.1", "ruff>=0.6.1"] 28 | 29 | [build-system] 30 | requires = ["setuptools>=73.0.0", "wheel"] 31 | build-backend = "setuptools.build_meta" 32 | 33 | [tool.setuptools] 34 | packages = ["email_assistant"] 35 | 36 | [tool.setuptools.package-dir] 37 | "email_assistant" = "src/email_assistant" 38 | 39 | [tool.setuptools.package-data] 40 | "*" = ["py.typed"] 41 | 42 | [tool.ruff] 43 | lint.select = [ 44 | "E", # pycodestyle 45 | "F", # pyflakes 46 | "I", # isort 47 | "D", # pydocstyle 48 | "D401", # First line should be in imperative mood 49 | "T201", 50 | "UP", 51 | ] 52 | lint.ignore = [ 53 | "UP006", 54 | "UP007", 55 | "UP035", 56 | "D417", 57 | 
"E501", 58 | ] 59 | 60 | [tool.ruff.lint.per-file-ignores] 61 | "tests/*" = ["D", "UP"] 62 | 63 | [tool.ruff.lint.pydocstyle] 64 | convention = "google" 65 | -------------------------------------------------------------------------------- /src/email_assistant/__init__.py: -------------------------------------------------------------------------------- 1 | version = "0.1.0" -------------------------------------------------------------------------------- /src/email_assistant/configuration.py: -------------------------------------------------------------------------------- 1 | """Define the configurable parameters for the agent.""" 2 | 3 | import os 4 | from dataclasses import dataclass, fields 5 | from typing import Any, Optional 6 | 7 | from langchain_core.runnables import RunnableConfig 8 | 9 | @dataclass(kw_only=True) 10 | class Configuration: 11 | """Placeholder for configuration.""" 12 | 13 | @classmethod 14 | def from_runnable_config( 15 | cls, config: Optional[RunnableConfig] = None 16 | ) -> "Configuration": 17 | """Create a Configuration instance from a RunnableConfig.""" 18 | configurable = ( 19 | config["configurable"] if config and "configurable" in config else {} 20 | ) 21 | values: dict[str, Any] = { 22 | f.name: os.environ.get(f.name.upper(), configurable.get(f.name)) 23 | for f in fields(cls) 24 | if f.init 25 | } 26 | 27 | return cls(**{k: v for k, v in values.items() if v}) -------------------------------------------------------------------------------- /src/email_assistant/cron.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import asyncio 4 | from typing import Dict, Any, TypedDict 5 | from dataclasses import dataclass, field 6 | from langgraph.graph import StateGraph, START, END 7 | from email_assistant.tools.gmail.run_ingest import fetch_and_process_emails 8 | 9 | @dataclass(kw_only=True) 10 | class JobKickoff: 11 | """State for the email ingestion cron job""" 12 | email: str 
13 | minutes_since: int = 60 14 | graph_name: str = "email_assistant_hitl_memory_gmail" 15 | url: str = "http://127.0.0.1:2024" 16 | include_read: bool = False 17 | rerun: bool = False 18 | early: bool = False 19 | skip_filters: bool = False 20 | 21 | async def main(state: JobKickoff): 22 | """Run the email ingestion process""" 23 | print(f"Kicking off job to fetch emails from the past {state.minutes_since} minutes") 24 | print(f"Email: {state.email}") 25 | print(f"URL: {state.url}") 26 | print(f"Graph name: {state.graph_name}") 27 | 28 | try: 29 | # Convert state to args object for fetch_and_process_emails 30 | class Args: 31 | def __init__(self, **kwargs): 32 | for key, value in kwargs.items(): 33 | setattr(self, key, value) 34 | print(f"Created Args with attributes: {dir(self)}") 35 | 36 | args = Args( 37 | email=state.email, 38 | minutes_since=state.minutes_since, 39 | graph_name=state.graph_name, 40 | url=state.url, 41 | include_read=state.include_read, 42 | rerun=state.rerun, 43 | early=state.early, 44 | skip_filters=state.skip_filters 45 | ) 46 | 47 | # Print email and URL to verify they're being passed correctly 48 | print(f"Args email: {args.email}") 49 | print(f"Args url: {args.url}") 50 | 51 | # Run the ingestion process 52 | print("Starting fetch_and_process_emails...") 53 | result = await fetch_and_process_emails(args) 54 | print(f"fetch_and_process_emails returned: {result}") 55 | 56 | # Return the result status 57 | return {"status": "success" if result == 0 else "error", "exit_code": result} 58 | except Exception as e: 59 | import traceback 60 | print(f"Error in cron job: {str(e)}") 61 | print(traceback.format_exc()) 62 | return {"status": "error", "error": str(e)} 63 | 64 | # Build the graph 65 | graph = StateGraph(JobKickoff) 66 | graph.add_node("ingest_emails", main) 67 | graph.set_entry_point("ingest_emails") 68 | graph = graph.compile() -------------------------------------------------------------------------------- 
/src/email_assistant/email_assistant.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from langchain.chat_models import init_chat_model 4 | 5 | from src.email_assistant.tools import get_tools, get_tools_by_name 6 | from src.email_assistant.tools.default.prompt_templates import AGENT_TOOLS_PROMPT 7 | from src.email_assistant.prompts import triage_system_prompt, triage_user_prompt, agent_system_prompt, default_background, default_triage_instructions, default_response_preferences, default_cal_preferences 8 | from src.email_assistant.schemas import State, RouterSchema, StateInput 9 | from src.email_assistant.utils import parse_email, format_email_markdown 10 | 11 | from langgraph.graph import StateGraph, START, END 12 | from langgraph.types import Command 13 | from dotenv import load_dotenv 14 | load_dotenv(".env") 15 | 16 | # Get tools 17 | tools = get_tools() 18 | tools_by_name = get_tools_by_name(tools) 19 | 20 | # Initialize the LLM for use with router / structured output 21 | llm = init_chat_model("openai:gpt-4.1", temperature=0.0) 22 | llm_router = llm.with_structured_output(RouterSchema) 23 | 24 | # Initialize the LLM, enforcing tool use (of any available tools) for agent 25 | llm = init_chat_model("openai:gpt-4.1", temperature=0.0) 26 | llm_with_tools = llm.bind_tools(tools, tool_choice="any") 27 | 28 | # Nodes 29 | def llm_call(state: State): 30 | """LLM decides whether to call a tool or not""" 31 | 32 | return { 33 | "messages": [ 34 | llm_with_tools.invoke( 35 | [ 36 | {"role": "system", "content": agent_system_prompt.format( 37 | tools_prompt=AGENT_TOOLS_PROMPT, 38 | background=default_background, 39 | response_preferences=default_response_preferences, 40 | cal_preferences=default_cal_preferences) 41 | }, 42 | 43 | ] 44 | + state["messages"] 45 | ) 46 | ] 47 | } 48 | 49 | def tool_node(state: State): 50 | """Performs the tool call""" 51 | 52 | result = [] 53 | for tool_call in 
state["messages"][-1].tool_calls: 54 | tool = tools_by_name[tool_call["name"]] 55 | observation = tool.invoke(tool_call["args"]) 56 | result.append({"role": "tool", "content" : observation, "tool_call_id": tool_call["id"]}) 57 | return {"messages": result} 58 | 59 | # Conditional edge function 60 | def should_continue(state: State) -> Literal["Action", "__end__"]: 61 | """Route to Action, or end if Done tool called""" 62 | messages = state["messages"] 63 | last_message = messages[-1] 64 | if last_message.tool_calls: 65 | for tool_call in last_message.tool_calls: 66 | if tool_call["name"] == "Done": 67 | return END 68 | else: 69 | return "Action" 70 | 71 | # Build workflow 72 | agent_builder = StateGraph(State) 73 | 74 | # Add nodes 75 | agent_builder.add_node("llm_call", llm_call) 76 | agent_builder.add_node("environment", tool_node) 77 | 78 | # Add edges to connect nodes 79 | agent_builder.add_edge(START, "llm_call") 80 | agent_builder.add_conditional_edges( 81 | "llm_call", 82 | should_continue, 83 | { 84 | # Name returned by should_continue : Name of next node to visit 85 | "Action": "environment", 86 | END: END, 87 | }, 88 | ) 89 | agent_builder.add_edge("environment", "llm_call") 90 | 91 | # Compile the agent 92 | agent = agent_builder.compile() 93 | 94 | def triage_router(state: State) -> Command[Literal["response_agent", "__end__"]]: 95 | """Analyze email content to decide if we should respond, notify, or ignore. 
96 | 97 | The triage step prevents the assistant from wasting time on: 98 | - Marketing emails and spam 99 | - Company-wide announcements 100 | - Messages meant for other teams 101 | """ 102 | author, to, subject, email_thread = parse_email(state["email_input"]) 103 | system_prompt = triage_system_prompt.format( 104 | background=default_background, 105 | triage_instructions=default_triage_instructions 106 | ) 107 | 108 | user_prompt = triage_user_prompt.format( 109 | author=author, to=to, subject=subject, email_thread=email_thread 110 | ) 111 | 112 | # Create email markdown for Agent Inbox in case of notification 113 | email_markdown = format_email_markdown(subject, author, to, email_thread) 114 | 115 | # Run the router LLM 116 | result = llm_router.invoke( 117 | [ 118 | {"role": "system", "content": system_prompt}, 119 | {"role": "user", "content": user_prompt}, 120 | ] 121 | ) 122 | 123 | # Decision 124 | classification = result.classification 125 | 126 | if classification == "respond": 127 | print("📧 Classification: RESPOND - This email requires a response") 128 | goto = "response_agent" 129 | # Add the email to the messages 130 | update = { 131 | "classification_decision": result.classification, 132 | "messages": [{"role": "user", 133 | "content": f"Respond to the email: {email_markdown}" 134 | }], 135 | } 136 | elif result.classification == "ignore": 137 | print("🚫 Classification: IGNORE - This email can be safely ignored") 138 | update = { 139 | "classification_decision": result.classification, 140 | } 141 | goto = END 142 | elif result.classification == "notify": 143 | # If real life, this would do something else 144 | print("🔔 Classification: NOTIFY - This email contains important information") 145 | update = { 146 | "classification_decision": result.classification, 147 | } 148 | goto = END 149 | else: 150 | raise ValueError(f"Invalid classification: {result.classification}") 151 | return Command(goto=goto, update=update) 152 | 153 | # Build workflow 154 | 
overall_workflow = ( 155 | StateGraph(State, input=StateInput) 156 | .add_node(triage_router) 157 | .add_node("response_agent", agent) 158 | .add_edge(START, "triage_router") 159 | ) 160 | 161 | email_assistant = overall_workflow.compile() -------------------------------------------------------------------------------- /src/email_assistant/email_assistant_hitl.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from langchain.chat_models import init_chat_model 4 | 5 | from langgraph.graph import StateGraph, START, END 6 | from langgraph.types import interrupt, Command 7 | 8 | from src.email_assistant.tools import get_tools, get_tools_by_name 9 | from src.email_assistant.tools.default.prompt_templates import HITL_TOOLS_PROMPT 10 | from src.email_assistant.prompts import triage_system_prompt, triage_user_prompt, agent_system_prompt_hitl, default_background, default_triage_instructions, default_response_preferences, default_cal_preferences 11 | from src.email_assistant.schemas import State, RouterSchema, StateInput 12 | from src.email_assistant.utils import parse_email, format_for_display, format_email_markdown 13 | from dotenv import load_dotenv 14 | 15 | load_dotenv(".env") 16 | 17 | # Get tools 18 | tools = get_tools(["write_email", "schedule_meeting", "check_calendar_availability", "Question", "Done"]) 19 | tools_by_name = get_tools_by_name(tools) 20 | 21 | # Initialize the LLM for use with router / structured output 22 | llm = init_chat_model("openai:gpt-4.1", temperature=0.0) 23 | llm_router = llm.with_structured_output(RouterSchema) 24 | 25 | # Initialize the LLM, enforcing tool use (of any available tools) for agent 26 | llm = init_chat_model("openai:gpt-4.1", temperature=0.0) 27 | llm_with_tools = llm.bind_tools(tools, tool_choice="required") 28 | 29 | # Nodes 30 | def triage_router(state: State) -> Command[Literal["triage_interrupt_handler", "response_agent", "__end__"]]: 31 | """Analyze 
email content to decide if we should respond, notify, or ignore. 32 | 33 | The triage step prevents the assistant from wasting time on: 34 | - Marketing emails and spam 35 | - Company-wide announcements 36 | - Messages meant for other teams 37 | """ 38 | 39 | # Parse the email input 40 | author, to, subject, email_thread = parse_email(state["email_input"]) 41 | user_prompt = triage_user_prompt.format( 42 | author=author, to=to, subject=subject, email_thread=email_thread 43 | ) 44 | 45 | # Create email markdown for Agent Inbox in case of notification 46 | email_markdown = format_email_markdown(subject, author, to, email_thread) 47 | 48 | # Format system prompt with background and triage instructions 49 | system_prompt = triage_system_prompt.format( 50 | background=default_background, 51 | triage_instructions=default_triage_instructions 52 | ) 53 | 54 | # Run the router LLM 55 | result = llm_router.invoke( 56 | [ 57 | {"role": "system", "content": system_prompt}, 58 | {"role": "user", "content": user_prompt}, 59 | ] 60 | ) 61 | 62 | # Decision 63 | classification = result.classification 64 | 65 | # Process the classification decision 66 | if classification == "respond": 67 | print("📧 Classification: RESPOND - This email requires a response") 68 | # Next node 69 | goto = "response_agent" 70 | # Update the state 71 | update = { 72 | "classification_decision": result.classification, 73 | "messages": [{"role": "user", 74 | "content": f"Respond to the email: {email_markdown}" 75 | }], 76 | } 77 | elif classification == "ignore": 78 | print("🚫 Classification: IGNORE - This email can be safely ignored") 79 | 80 | # Next node 81 | goto = END 82 | # Update the state 83 | update = { 84 | "classification_decision": classification, 85 | } 86 | 87 | elif classification == "notify": 88 | print("🔔 Classification: NOTIFY - This email contains important information") 89 | 90 | # Next node 91 | goto = "triage_interrupt_handler" 92 | # Update the state 93 | update = { 94 | 
"classification_decision": classification, 95 | } 96 | 97 | else: 98 | raise ValueError(f"Invalid classification: {classification}") 99 | return Command(goto=goto, update=update) 100 | 101 | def triage_interrupt_handler(state: State) -> Command[Literal["response_agent", "__end__"]]: 102 | """Handles interrupts from the triage step""" 103 | 104 | # Parse the email input 105 | author, to, subject, email_thread = parse_email(state["email_input"]) 106 | 107 | # Create email markdown for Agent Inbox in case of notification 108 | email_markdown = format_email_markdown(subject, author, to, email_thread) 109 | 110 | # Create messages 111 | messages = [{"role": "user", 112 | "content": f"Email to notify user about: {email_markdown}" 113 | }] 114 | 115 | # Create interrupt for Agent Inbox 116 | request = { 117 | "action_request": { 118 | "action": f"Email Assistant: {state['classification_decision']}", 119 | "args": {} 120 | }, 121 | "config": { 122 | "allow_ignore": True, 123 | "allow_respond": True, 124 | "allow_edit": False, 125 | "allow_accept": False, 126 | }, 127 | # Email to show in Agent Inbox 128 | "description": email_markdown, 129 | } 130 | 131 | # Agent Inbox responds with a list 132 | response = interrupt([request])[0] 133 | 134 | # If user provides feedback, go to response agent and use feedback to respond to email 135 | if response["type"] == "response": 136 | # Add feedback to messages 137 | user_input = response["args"] 138 | # Used by the response agent 139 | messages.append({"role": "user", 140 | "content": f"User wants to reply to the email. 
Use this feedback to respond: {user_input}" 141 | }) 142 | # Go to response agent 143 | goto = "response_agent" 144 | 145 | # If user ignores email, go to END 146 | elif response["type"] == "ignore": 147 | goto = END 148 | 149 | # Catch all other responses 150 | else: 151 | raise ValueError(f"Invalid response: {response}") 152 | 153 | # Update the state 154 | update = { 155 | "messages": messages, 156 | } 157 | 158 | return Command(goto=goto, update=update) 159 | 160 | def llm_call(state: State): 161 | """LLM decides whether to call a tool or not""" 162 | 163 | return { 164 | "messages": [ 165 | llm_with_tools.invoke( 166 | [ 167 | {"role": "system", "content": agent_system_prompt_hitl.format( 168 | tools_prompt=HITL_TOOLS_PROMPT, 169 | background=default_background, 170 | response_preferences=default_response_preferences, 171 | cal_preferences=default_cal_preferences 172 | )} 173 | ] 174 | + state["messages"] 175 | ) 176 | ] 177 | } 178 | 179 | def interrupt_handler(state: State) -> Command[Literal["llm_call", "__end__"]]: 180 | """Creates an interrupt for human review of tool calls""" 181 | 182 | # Store messages 183 | result = [] 184 | 185 | # Go to the LLM call node next 186 | goto = "llm_call" 187 | 188 | # Iterate over the tool calls in the last message 189 | for tool_call in state["messages"][-1].tool_calls: 190 | 191 | # Allowed tools for HITL 192 | hitl_tools = ["write_email", "schedule_meeting", "Question"] 193 | 194 | # If tool is not in our HITL list, execute it directly without interruption 195 | if tool_call["name"] not in hitl_tools: 196 | 197 | # Execute search_memory and other tools without interruption 198 | tool = tools_by_name[tool_call["name"]] 199 | observation = tool.invoke(tool_call["args"]) 200 | result.append({"role": "tool", "content": observation, "tool_call_id": tool_call["id"]}) 201 | continue 202 | 203 | # Get original email from email_input in state 204 | email_input = state["email_input"] 205 | author, to, subject, email_thread = 
parse_email(email_input) 206 | original_email_markdown = format_email_markdown(subject, author, to, email_thread) 207 | 208 | # Format tool call for display and prepend the original email 209 | tool_display = format_for_display(state, tool_call) 210 | description = original_email_markdown + tool_display 211 | 212 | # Configure what actions are allowed in Agent Inbox 213 | if tool_call["name"] == "write_email": 214 | config = { 215 | "allow_ignore": True, 216 | "allow_respond": True, 217 | "allow_edit": True, 218 | "allow_accept": True, 219 | } 220 | elif tool_call["name"] == "schedule_meeting": 221 | config = { 222 | "allow_ignore": True, 223 | "allow_respond": True, 224 | "allow_edit": True, 225 | "allow_accept": True, 226 | } 227 | elif tool_call["name"] == "Question": 228 | config = { 229 | "allow_ignore": True, 230 | "allow_respond": True, 231 | "allow_edit": False, 232 | "allow_accept": False, 233 | } 234 | else: 235 | raise ValueError(f"Invalid tool call: {tool_call['name']}") 236 | 237 | # Create the interrupt request 238 | request = { 239 | "action_request": { 240 | "action": tool_call["name"], 241 | "args": tool_call["args"] 242 | }, 243 | "config": config, 244 | "description": description, 245 | } 246 | 247 | # Send to Agent Inbox and wait for response 248 | response = interrupt([request])[0] 249 | 250 | # Handle the responses 251 | if response["type"] == "accept": 252 | 253 | # Execute the tool with original args 254 | tool = tools_by_name[tool_call["name"]] 255 | observation = tool.invoke(tool_call["args"]) 256 | result.append({"role": "tool", "content": observation, "tool_call_id": tool_call["id"]}) 257 | 258 | elif response["type"] == "edit": 259 | 260 | # Tool selection 261 | tool = tools_by_name[tool_call["name"]] 262 | 263 | # Get edited args from Agent Inbox 264 | edited_args = response["args"]["args"] 265 | 266 | # Update the AI message's tool call with edited content (reference to the message in the state) 267 | ai_message = 
state["messages"][-1] # Get the most recent message from the state 268 | current_id = tool_call["id"] # Store the ID of the tool call being edited 269 | 270 | # Create a new list of tool calls by filtering out the one being edited and adding the updated version 271 | # This avoids modifying the original list directly (immutable approach) 272 | updated_tool_calls = [tc for tc in ai_message.tool_calls if tc["id"] != current_id] + [ 273 | {"type": "tool_call", "name": tool_call["name"], "args": edited_args, "id": current_id} 274 | ] 275 | 276 | # Create a new copy of the message with updated tool calls rather than modifying the original 277 | # This ensures state immutability and prevents side effects in other parts of the code 278 | result.append(ai_message.model_copy(update={"tool_calls": updated_tool_calls})) 279 | 280 | # Update the write_email tool call with the edited content from Agent Inbox 281 | if tool_call["name"] == "write_email": 282 | 283 | # Execute the tool with edited args 284 | observation = tool.invoke(edited_args) 285 | 286 | # Add only the tool response message 287 | result.append({"role": "tool", "content": observation, "tool_call_id": current_id}) 288 | 289 | # Update the schedule_meeting tool call with the edited content from Agent Inbox 290 | elif tool_call["name"] == "schedule_meeting": 291 | 292 | 293 | # Execute the tool with edited args 294 | observation = tool.invoke(edited_args) 295 | 296 | # Add only the tool response message 297 | result.append({"role": "tool", "content": observation, "tool_call_id": current_id}) 298 | 299 | # Catch all other tool calls 300 | else: 301 | raise ValueError(f"Invalid tool call: {tool_call['name']}") 302 | 303 | elif response["type"] == "ignore": 304 | if tool_call["name"] == "write_email": 305 | # Don't execute the tool, and tell the agent how to proceed 306 | result.append({"role": "tool", "content": "User ignored this email draft. 
Ignore this email and end the workflow.", "tool_call_id": tool_call["id"]}) 307 | # Go to END 308 | goto = END 309 | elif tool_call["name"] == "schedule_meeting": 310 | # Don't execute the tool, and tell the agent how to proceed 311 | result.append({"role": "tool", "content": "User ignored this calendar meeting draft. Ignore this email and end the workflow.", "tool_call_id": tool_call["id"]}) 312 | # Go to END 313 | goto = END 314 | elif tool_call["name"] == "Question": 315 | # Don't execute the tool, and tell the agent how to proceed 316 | result.append({"role": "tool", "content": "User ignored this question. Ignore this email and end the workflow.", "tool_call_id": tool_call["id"]}) 317 | # Go to END 318 | goto = END 319 | else: 320 | raise ValueError(f"Invalid tool call: {tool_call['name']}") 321 | 322 | elif response["type"] == "response": 323 | # User provided feedback 324 | user_feedback = response["args"] 325 | if tool_call["name"] == "write_email": 326 | # Don't execute the tool, and add a message with the user feedback to incorporate into the email 327 | result.append({"role": "tool", "content": f"User gave feedback, which can we incorporate into the email. Feedback: {user_feedback}", "tool_call_id": tool_call["id"]}) 328 | elif tool_call["name"] == "schedule_meeting": 329 | # Don't execute the tool, and add a message with the user feedback to incorporate into the email 330 | result.append({"role": "tool", "content": f"User gave feedback, which can we incorporate into the meeting request. Feedback: {user_feedback}", "tool_call_id": tool_call["id"]}) 331 | elif tool_call["name"] == "Question": 332 | # Don't execute the tool, and add a message with the user feedback to incorporate into the email 333 | result.append({"role": "tool", "content": f"User answered the question, which can we can use for any follow up actions. 
Feedback: {user_feedback}", "tool_call_id": tool_call["id"]}) 334 | else: 335 | raise ValueError(f"Invalid tool call: {tool_call['name']}") 336 | 337 | # Catch all other responses 338 | else: 339 | raise ValueError(f"Invalid response: {response}") 340 | 341 | # Update the state 342 | update = { 343 | "messages": result, 344 | } 345 | 346 | return Command(goto=goto, update=update) 347 | 348 | # Conditional edge function 349 | def should_continue(state: State) -> Literal["interrupt_handler", "__end__"]: 350 | """Route to tool handler, or end if Done tool called""" 351 | messages = state["messages"] 352 | last_message = messages[-1] 353 | if last_message.tool_calls: 354 | for tool_call in last_message.tool_calls: 355 | if tool_call["name"] == "Done": 356 | return END 357 | else: 358 | return "interrupt_handler" 359 | 360 | # Build workflow 361 | agent_builder = StateGraph(State) 362 | 363 | # Add nodes 364 | agent_builder.add_node("llm_call", llm_call) 365 | agent_builder.add_node("interrupt_handler", interrupt_handler) 366 | 367 | # Add edges 368 | agent_builder.add_edge(START, "llm_call") 369 | agent_builder.add_conditional_edges( 370 | "llm_call", 371 | should_continue, 372 | { 373 | "interrupt_handler": "interrupt_handler", 374 | END: END, 375 | }, 376 | ) 377 | 378 | # Compile the agent 379 | response_agent = agent_builder.compile() 380 | 381 | # Build overall workflow 382 | overall_workflow = ( 383 | StateGraph(State, input=StateInput) 384 | .add_node(triage_router) 385 | .add_node(triage_interrupt_handler) 386 | .add_node("response_agent", response_agent) 387 | .add_edge(START, "triage_router") 388 | 389 | ) 390 | 391 | email_assistant = overall_workflow.compile() -------------------------------------------------------------------------------- /src/email_assistant/langgraph_101.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from langchain.chat_models import init_chat_model 3 | from 
langchain.tools import tool 4 | from langgraph.graph import MessagesState, StateGraph, END, START 5 | from dotenv import load_dotenv 6 | load_dotenv(".env") 7 | 8 | @tool 9 | def write_email(to: str, subject: str, content: str) -> str: 10 | """Write and send an email.""" 11 | # Placeholder response - in real app would send email 12 | return f"Email sent to {to} with subject '{subject}' and content: {content}" 13 | 14 | llm = init_chat_model("openai:gpt-4.1", temperature=0) 15 | model_with_tools = llm.bind_tools([write_email], tool_choice="any") 16 | 17 | def call_llm(state: MessagesState) -> MessagesState: 18 | """Run LLM""" 19 | 20 | output = model_with_tools.invoke(state["messages"]) 21 | return {"messages": [output]} 22 | 23 | def run_tool(state: MessagesState) -> MessagesState: 24 | """Performs the tool call""" 25 | 26 | result = [] 27 | for tool_call in state["messages"][-1].tool_calls: 28 | observation = write_email.invoke(tool_call["args"]) 29 | result.append({"role": "tool", "content": observation, "tool_call_id": tool_call["id"]}) 30 | return {"messages": result} 31 | 32 | def should_continue(state: MessagesState) -> Literal["run_tool", "__end__"]: 33 | """Route to tool handler, or end if Done tool called""" 34 | 35 | # Get the last message 36 | messages = state["messages"] 37 | last_message = messages[-1] 38 | 39 | # If the last message is a tool call, check if it's a Done tool call 40 | if last_message.tool_calls: 41 | return "run_tool" 42 | # Otherwise, we stop (reply to the user) 43 | return END 44 | 45 | workflow = StateGraph(MessagesState) 46 | workflow.add_node("call_llm", call_llm) 47 | workflow.add_node("run_tool", run_tool) 48 | workflow.add_edge(START, "call_llm") 49 | workflow.add_conditional_edges("call_llm", should_continue, {"run_tool": "run_tool", END: END}) 50 | workflow.add_edge("run_tool", END) 51 | 52 | app = workflow.compile() -------------------------------------------------------------------------------- 
/src/email_assistant/prompts.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | # Email assistant triage prompt 4 | triage_system_prompt = """ 5 | 6 | < Role > 7 | Your role is to triage incoming emails based upon instructs and background information below. 8 | 9 | 10 | < Background > 11 | {background}. 12 | 13 | 14 | < Instructions > 15 | Categorize each email into one of three categories: 16 | 1. IGNORE - Emails that are not worth responding to or tracking 17 | 2. NOTIFY - Important information that worth notification but doesn't require a response 18 | 3. RESPOND - Emails that need a direct response 19 | Classify the below email into one of these categories. 20 | 21 | 22 | < Rules > 23 | {triage_instructions} 24 | 25 | """ 26 | 27 | # Email assistant triage user prompt 28 | triage_user_prompt = """ 29 | Please determine how to handle the below email thread: 30 | 31 | From: {author} 32 | To: {to} 33 | Subject: {subject} 34 | {email_thread}""" 35 | 36 | # Email assistant prompt 37 | agent_system_prompt = """ 38 | < Role > 39 | You are a top-notch executive assistant who cares about helping your executive perform as well as possible. 40 | 41 | 42 | < Tools > 43 | You have access to the following tools to help manage communications and schedule: 44 | {tools_prompt} 45 | 46 | 47 | < Instructions > 48 | When handling emails, follow these steps: 49 | 1. Carefully analyze the email content and purpose 50 | 2. IMPORTANT --- always call a tool and call one tool at a time until the task is complete: 51 | 3. For responding to the email, draft a response email with the write_email tool 52 | 4. For meeting requests, use the check_calendar_availability tool to find open time slots 53 | 5. 
To schedule a meeting, use the schedule_meeting tool with a datetime object for the preferred_day parameter 54 | - Today's date is """ + datetime.now().strftime("%Y-%m-%d") + """ - use this for scheduling meetings accurately 55 | 6. If you scheduled a meeting, then draft a short response email using the write_email tool 56 | 7. After using the write_email tool, the task is complete 57 | 8. If you have sent the email, then use the Done tool to indicate that the task is complete 58 | 59 | 60 | < Background > 61 | {background} 62 | 63 | 64 | < Response Preferences > 65 | {response_preferences} 66 | 67 | 68 | < Calendar Preferences > 69 | {cal_preferences} 70 | 71 | """ 72 | 73 | # Email assistant with HITL prompt 74 | agent_system_prompt_hitl = """ 75 | < Role > 76 | You are a top-notch executive assistant who cares about helping your executive perform as well as possible. 77 | 78 | 79 | < Tools > 80 | You have access to the following tools to help manage communications and schedule: 81 | {tools_prompt} 82 | 83 | 84 | < Instructions > 85 | When handling emails, follow these steps: 86 | 1. Carefully analyze the email content and purpose 87 | 2. IMPORTANT --- always call a tool and call one tool at a time until the task is complete: 88 | 3. If the incoming email asks the user a direct question and you do not have context to answer the question, use the Question tool to ask the user for the answer 89 | 4. For responding to the email, draft a response email with the write_email tool 90 | 5. For meeting requests, use the check_calendar_availability tool to find open time slots 91 | 6. To schedule a meeting, use the schedule_meeting tool with a datetime object for the preferred_day parameter 92 | - Today's date is """ + datetime.now().strftime("%Y-%m-%d") + """ - use this for scheduling meetings accurately 93 | 7. If you scheduled a meeting, then draft a short response email using the write_email tool 94 | 8. After using the write_email tool, the task is complete 95 | 9. 
If you have sent the email, then use the Done tool to indicate that the task is complete 96 | 97 | 98 | < Background > 99 | {background} 100 | 101 | 102 | < Response Preferences > 103 | {response_preferences} 104 | 105 | 106 | < Calendar Preferences > 107 | {cal_preferences} 108 | 109 | """ 110 | 111 | # Email assistant with HITL and memory prompt 112 | # Note: Currently, this is the same as the HITL prompt. However, memory specific tools (see https://langchain-ai.github.io/langmem/) can be added 113 | agent_system_prompt_hitl_memory = """ 114 | < Role > 115 | You are a top-notch executive assistant. 116 | 117 | 118 | < Tools > 119 | You have access to the following tools to help manage communications and schedule: 120 | {tools_prompt} 121 | 122 | 123 | < Instructions > 124 | When handling emails, follow these steps: 125 | 1. Carefully analyze the email content and purpose 126 | 2. IMPORTANT --- always call a tool and call one tool at a time until the task is complete: 127 | 3. If the incoming email asks the user a direct question and you do not have context to answer the question, use the Question tool to ask the user for the answer 128 | 4. For responding to the email, draft a response email with the write_email tool 129 | 5. For meeting requests, use the check_calendar_availability tool to find open time slots 130 | 6. To schedule a meeting, use the schedule_meeting tool with a datetime object for the preferred_day parameter 131 | - Today's date is """ + datetime.now().strftime("%Y-%m-%d") + """ - use this for scheduling meetings accurately 132 | 7. If you scheduled a meeting, then draft a short response email using the write_email tool 133 | 8. After using the write_email tool, the task is complete 134 | 9. 
If you have sent the email, then use the Done tool to indicate that the task is complete 135 | 136 | 137 | < Background > 138 | {background} 139 | 140 | 141 | < Response Preferences > 142 | {response_preferences} 143 | 144 | 145 | < Calendar Preferences > 146 | {cal_preferences} 147 | 148 | """ 149 | 150 | # Default background information 151 | default_background = """ 152 | I'm Lance, a software engineer at LangChain. 153 | """ 154 | 155 | # Default response preferences 156 | default_response_preferences = """ 157 | Use professional and concise language. If the e-mail mentions a deadline, make sure to explicitly acknowledge and reference the deadline in your response. 158 | 159 | When responding to technical questions that require investigation: 160 | - Clearly state whether you will investigate or who you will ask 161 | - Provide an estimated timeline for when you'll have more information or complete the task 162 | 163 | When responding to event or conference invitations: 164 | - Always acknowledge any mentioned deadlines (particularly registration deadlines) 165 | - If workshops or specific topics are mentioned, ask for more specific details about them 166 | - If discounts (group or early bird) are mentioned, explicitly request information about them 167 | - Don't commit 168 | 169 | When responding to collaboration or project-related requests: 170 | - Acknowledge any existing work or materials mentioned (drafts, slides, documents, etc.) 171 | - Explicitly mention reviewing these materials before or during the meeting 172 | - When scheduling meetings, clearly state the specific day, date, and time proposed 173 | 174 | When responding to meeting scheduling requests: 175 | - If times are proposed, verify calendar availability for all time slots mentioned in the original email and then commit to one of the proposed times based on your availability by scheduling the meeting. Or, say you can't make it at the time proposed. 
176 | - If no times are proposed, then check your calendar for availability and propose multiple time options when available instead of selecting just one. 177 | - Mention the meeting duration in your response to confirm you've noted it correctly. 178 | - Reference the meeting's purpose in your response. 179 | """ 180 | 181 | # Default calendar preferences 182 | default_cal_preferences = """ 183 | 30 minute meetings are preferred, but 15 minute meetings are also acceptable. 184 | """ 185 | 186 | # Default triage instructions 187 | default_triage_instructions = """ 188 | Emails that are not worth responding to: 189 | - Marketing newsletters and promotional emails 190 | - Spam or suspicious emails 191 | - CC'd on FYI threads with no direct questions 192 | 193 | There are also other things that should be known about, but don't require an email response. For these, you should notify (using the `notify` response). Examples of this include: 194 | - Team member out sick or on vacation 195 | - Build system notifications or deployments 196 | - Project status updates without action items 197 | - Important company announcements 198 | - FYI emails that contain relevant information for current projects 199 | - HR Department deadline reminders 200 | - Subscription status / renewal reminders 201 | - GitHub notifications 202 | 203 | Emails that are worth responding to: 204 | - Direct questions from team members requiring expertise 205 | - Meeting requests requiring confirmation 206 | - Critical bug reports related to team's projects 207 | - Requests from management requiring acknowledgment 208 | - Client inquiries about project status or features 209 | - Technical questions about documentation, code, or APIs (especially questions about missing endpoints or features) 210 | - Personal reminders related to family (wife / daughter) 211 | - Personal reminder related to self-care (doctor appointments, etc) 212 | """ 
-------------------------------------------------------------------------------- /src/email_assistant/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Optional 3 | from typing_extensions import TypedDict, Literal, Annotated 4 | from langgraph.graph import MessagesState 5 | 6 | class RouterSchema(BaseModel): 7 | """Analyze the unread email and route it according to its content.""" 8 | 9 | reasoning: str = Field( 10 | description="Step-by-step reasoning behind the classification." 11 | ) 12 | classification: Literal["ignore", "respond", "notify"] = Field( 13 | description="The classification of an email: 'ignore' for irrelevant emails, " 14 | "'notify' for important information that doesn't need a response, " 15 | "'respond' for emails that need a reply", 16 | ) 17 | 18 | class StateInput(TypedDict): 19 | # This is the input to the state 20 | email_input: dict 21 | 22 | class State(MessagesState): 23 | # This state class has the messages key build in 24 | email_input: dict 25 | classification_decision: Literal["ignore", "respond", "notify"] 26 | 27 | class EmailData(TypedDict): 28 | id: str 29 | thread_id: str 30 | from_email: str 31 | subject: str 32 | page_content: str 33 | send_time: str 34 | to_email: str -------------------------------------------------------------------------------- /src/email_assistant/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from src.email_assistant.tools.base import get_tools, get_tools_by_name 2 | from src.email_assistant.tools.default.email_tools import write_email, triage_email, Done 3 | from src.email_assistant.tools.default.calendar_tools import schedule_meeting, check_calendar_availability 4 | 5 | __all__ = [ 6 | "get_tools", 7 | "get_tools_by_name", 8 | "write_email", 9 | "triage_email", 10 | "Done", 11 | "schedule_meeting", 12 | "check_calendar_availability", 13 | 
] -------------------------------------------------------------------------------- /src/email_assistant/tools/base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Callable, Any 2 | from langchain_core.tools import BaseTool 3 | 4 | def get_tools(tool_names: List[str] = None, include_gmail: bool = False) -> List[BaseTool]: 5 | """Get specified tools or all tools if tool_names is None. 6 | 7 | Args: 8 | tool_names: Optional list of tool names to include. If None, returns all tools. 9 | include_gmail: Whether to include Gmail tools. Defaults to False. 10 | 11 | Returns: 12 | List of tool objects 13 | """ 14 | # Import default tools 15 | from src.email_assistant.tools.default.email_tools import write_email, Done, Question 16 | from src.email_assistant.tools.default.calendar_tools import schedule_meeting, check_calendar_availability 17 | 18 | # Base tools dictionary 19 | all_tools = { 20 | "write_email": write_email, 21 | "Done": Done, 22 | "Question": Question, 23 | "schedule_meeting": schedule_meeting, 24 | "check_calendar_availability": check_calendar_availability, 25 | } 26 | 27 | # Add Gmail tools if requested 28 | if include_gmail: 29 | try: 30 | from src.email_assistant.tools.gmail.gmail_tools import ( 31 | fetch_emails_tool, 32 | send_email_tool, 33 | check_calendar_tool, 34 | schedule_meeting_tool 35 | ) 36 | 37 | all_tools.update({ 38 | "fetch_emails_tool": fetch_emails_tool, 39 | "send_email_tool": send_email_tool, 40 | "check_calendar_tool": check_calendar_tool, 41 | "schedule_meeting_tool": schedule_meeting_tool, 42 | }) 43 | except ImportError: 44 | # If Gmail tools aren't available, continue without them 45 | pass 46 | 47 | if tool_names is None: 48 | return list(all_tools.values()) 49 | 50 | return [all_tools[name] for name in tool_names if name in all_tools] 51 | 52 | def get_tools_by_name(tools: List[BaseTool] = None) -> Dict[str, BaseTool]: 53 | """Get a dictionary of tools mapped 
by name.""" 54 | if tools is None: 55 | tools = get_tools() 56 | 57 | return {tool.name: tool for tool in tools} 58 | -------------------------------------------------------------------------------- /src/email_assistant/tools/default/__init__.py: -------------------------------------------------------------------------------- 1 | """Default tools for email assistant.""" 2 | 3 | from src.email_assistant.tools.default.email_tools import write_email, triage_email, Done 4 | from src.email_assistant.tools.default.calendar_tools import schedule_meeting, check_calendar_availability 5 | from src.email_assistant.tools.default.prompt_templates import ( 6 | STANDARD_TOOLS_PROMPT, 7 | AGENT_TOOLS_PROMPT, 8 | HITL_TOOLS_PROMPT, 9 | HITL_MEMORY_TOOLS_PROMPT 10 | ) 11 | 12 | __all__ = [ 13 | "write_email", 14 | "triage_email", 15 | "Done", 16 | "schedule_meeting", 17 | "check_calendar_availability", 18 | "STANDARD_TOOLS_PROMPT", 19 | "AGENT_TOOLS_PROMPT", 20 | "HITL_TOOLS_PROMPT", 21 | "HITL_MEMORY_TOOLS_PROMPT" 22 | ] -------------------------------------------------------------------------------- /src/email_assistant/tools/default/calendar_tools.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from langchain_core.tools import tool 3 | 4 | @tool 5 | def schedule_meeting( 6 | attendees: list[str], subject: str, duration_minutes: int, preferred_day: datetime, start_time: int 7 | ) -> str: 8 | """Schedule a calendar meeting.""" 9 | # Placeholder response - in real app would check calendar and schedule 10 | date_str = preferred_day.strftime("%A, %B %d, %Y") 11 | return f"Meeting '{subject}' scheduled on {date_str} at {start_time} for {duration_minutes} minutes with {len(attendees)} attendees" 12 | 13 | @tool 14 | def check_calendar_availability(day: str) -> str: 15 | """Check calendar availability for a given day.""" 16 | # Placeholder response - in real app would check actual calendar 17 | return f"Available 
times on {day}: 9:00 AM, 2:00 PM, 4:00 PM" 18 | -------------------------------------------------------------------------------- /src/email_assistant/tools/default/email_tools.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from pydantic import BaseModel 3 | from langchain_core.tools import tool 4 | 5 | @tool 6 | def write_email(to: str, subject: str, content: str) -> str: 7 | """Write and send an email.""" 8 | # Placeholder response - in real app would send email 9 | return f"Email sent to {to} with subject '{subject}' and content: {content}" 10 | 11 | @tool 12 | def triage_email(category: Literal["ignore", "notify", "respond"]) -> str: 13 | """Triage an email into one of three categories: ignore, notify, respond.""" 14 | return f"Classification Decision: {category}" 15 | 16 | @tool 17 | class Done(BaseModel): 18 | """E-mail has been sent.""" 19 | done: bool 20 | 21 | @tool 22 | class Question(BaseModel): 23 | """Question to ask user.""" 24 | content: str 25 | -------------------------------------------------------------------------------- /src/email_assistant/tools/default/prompt_templates.py: -------------------------------------------------------------------------------- 1 | """Tool prompt templates for the email assistant.""" 2 | 3 | # Standard tool descriptions for insertion into prompts 4 | STANDARD_TOOLS_PROMPT = """ 5 | 1. triage_email(ignore, notify, respond) - Triage emails into one of three categories 6 | 2. write_email(to, subject, content) - Send emails to specified recipients 7 | 3. schedule_meeting(attendees, subject, duration_minutes, preferred_day, start_time) - Schedule calendar meetings where preferred_day is a datetime object 8 | 4. check_calendar_availability(day) - Check available time slots for a given day 9 | 5. Done - E-mail has been sent 10 | """ 11 | 12 | # Tool descriptions for HITL workflow 13 | HITL_TOOLS_PROMPT = """ 14 | 1. 
write_email(to, subject, content) - Send emails to specified recipients 15 | 2. schedule_meeting(attendees, subject, duration_minutes, preferred_day, start_time) - Schedule calendar meetings where preferred_day is a datetime object 16 | 3. check_calendar_availability(day) - Check available time slots for a given day 17 | 4. Question(content) - Ask the user any follow-up questions 18 | 5. Done - E-mail has been sent 19 | """ 20 | 21 | # Tool descriptions for HITL with memory workflow 22 | # Note: Additional memory specific tools could be added here 23 | HITL_MEMORY_TOOLS_PROMPT = """ 24 | 1. write_email(to, subject, content) - Send emails to specified recipients 25 | 2. schedule_meeting(attendees, subject, duration_minutes, preferred_day, start_time) - Schedule calendar meetings where preferred_day is a datetime object 26 | 3. check_calendar_availability(day) - Check available time slots for a given day 27 | 4. Question(content) - Ask the user any follow-up questions 28 | 5. Done - E-mail has been sent 29 | """ 30 | 31 | # Tool descriptions for agent workflow without triage 32 | AGENT_TOOLS_PROMPT = """ 33 | 1. write_email(to, subject, content) - Send emails to specified recipients 34 | 2. schedule_meeting(attendees, subject, duration_minutes, preferred_day, start_time) - Schedule calendar meetings where preferred_day is a datetime object 35 | 3. check_calendar_availability(day) - Check available time slots for a given day 36 | 4. Done - E-mail has been sent 37 | """ -------------------------------------------------------------------------------- /src/email_assistant/tools/gmail/README.md: -------------------------------------------------------------------------------- 1 | # Gmail Integration Tools 2 | 3 | Connect your email assistant to Gmail and Google Calendar APIs. 4 | 5 | ## Graph 6 | 7 | The `src/email_assistant/email_assistant_hitl_memory_gmail.py` graph is configured to use Gmail tools. 
8 | 9 | You simply need to run the setup below to obtain the credentials needed to run the graph with your own email. 10 | 11 | ## Setup Credentials 12 | 13 | ### 1. Set up Google Cloud Project and Enable Required APIs 14 | 15 | #### Enable Gmail and Calendar APIs 16 | 17 | 1. Go to the [Google APIs Library and enable the Gmail API](https://developers.google.com/workspace/gmail/api/quickstart/python#enable_the_api) 18 | 2. Go to the [Google APIs Library and enable the Google Calendar API](https://developers.google.com/workspace/calendar/api/quickstart/python#enable_the_api) 19 | 20 | #### Create OAuth Credentials 21 | 22 | 1. Authorize credentials for a desktop application [here](https://developers.google.com/workspace/gmail/api/quickstart/python#authorize_credentials_for_a_desktop_application) 23 | 2. Go to Credentials → Create Credentials → OAuth Client ID 24 | 3. Set Application Type to "Desktop app" 25 | 4. Click "Create" 26 | 27 | > Note: If using a personal email (non-Google Workspace) select "External" under "Audience" 28 | 29 | Screenshot 2025-04-26 at 7 43 57 AM 30 | 31 | > Then, add yourself as a test user 32 | 33 | 5. Save the downloaded JSON file (you'll need this in the next step) 34 | 35 | ### 2. Set Up Authentication Files 36 | 37 | 1. Move your downloaded client secret JSON file to the `.secrets` directory 38 | 39 | ```bash 40 | # Create a secrets directory 41 | mkdir -p src/email_assistant/tools/gmail/.secrets 42 | 43 | # Move your downloaded client secret to the secrets directory 44 | mv /path/to/downloaded/client_secret.json src/email_assistant/tools/gmail/.secrets/secrets.json 45 | ``` 46 | 47 | 2. 
Run the Gmail setup script 48 | 49 | ```bash 50 | # Run the Gmail setup script 51 | python src/email_assistant/tools/gmail/setup_gmail.py 52 | ``` 53 | 54 | - This will open a browser window for you to authenticate with your Google account 55 | - This will create a `token.json` file in the `.secrets` directory 56 | - This token will be used for Gmail API access 57 | 58 | ## Use With A Local Deployment 59 | 60 | ### 1. Run the Gmail Ingestion Script with Locally Running LangGraph Server 61 | 62 | 1. Once you have authentication set up, run LangGraph server locally: 63 | 64 | ``` 65 | langgraph dev 66 | ``` 67 | 68 | 2. Run the ingestion script in another terminal with desired parameters: 69 | 70 | ```bash 71 | python src/email_assistant/tools/gmail/run_ingest.py --email lance@langgraph.dev --minutes-since 1000 72 | ``` 73 | 74 | - By default, this will use the local deployment URL (http://127.0.0.1:2024) and fetch emails from the past 1000 minutes. 75 | - It will use the LangGraph SDK to pass each email to the locally running email assistant. 76 | - It will use the `email_assistant_hitl_memory_gmail` graph, which is configured to use Gmail tools. 
77 | 78 | #### Parameters: 79 | 80 | - `--graph-name`: Name of the LangGraph to use (default: "email_assistant_hitl_memory_gmail") 81 | - `--email`: The email address to fetch messages from (alternative to setting EMAIL_ADDRESS) 82 | - `--minutes-since`: Only process emails that are newer than this many minutes (default: 60) 83 | - `--url`: URL of the LangGraph deployment (default: http://127.0.0.1:2024) 84 | - `--rerun`: Process emails that have already been processed (default: false) 85 | - `--early`: Stop after processing one email (default: false) 86 | - `--include-read`: Include emails that have already been read (by default only unread emails are processed) 87 | - `--skip-filters`: Process all emails without filtering (by default only latest messages in threads where you're not the sender are processed) 88 | 89 | #### Troubleshooting: 90 | 91 | - **Missing emails?** The Gmail API applies filters to show only important/primary emails by default. You can: 92 | - Increase the `--minutes-since` parameter to a larger value (e.g., 1000) to fetch emails from a longer time period 93 | - Use the `--include-read` flag to process emails marked as "read" (by default only unread emails are processed) 94 | - Use the `--skip-filters` flag to include all messages (not just the latest in a thread, and including ones you sent) 95 | - Try running with all options to process everything: `--include-read --skip-filters --minutes-since 1000` 96 | - Use the `--mock` flag to test the system with simulated emails 97 | 98 | ### 2. Connect to Agent Inbox 99 | 100 | After ingestion, you can access all your interrupted threads in Agent Inbox (https://dev.agentinbox.ai/): 101 | * Deployment URL: http://127.0.0.1:2024 102 | * Assistant/Graph ID: `email_assistant_hitl_memory_gmail` 103 | * Name: `Graph Name` 104 | 105 | ## Run A Hosted Deployment 106 | 107 | ### 1. Deploy to LangGraph Platform 108 | 109 | 1. Navigate to the deployments page in LangSmith 110 | 2. Click New Deployment 111 | 3. 
Connect it to your fork of [this repo](https://github.com/langchain-ai/agents-from-scratch) and desired branch 112 | 4. Give it a name like `Yourname-Email-Assistant` 113 | 5. Add the following environment variables: 114 | * `OPENAI_API_KEY` 115 | * `GMAIL_SECRET` - This is the full dictionary in `.secrets/secrets.json` 116 | * `GMAIL_TOKEN` - This is the full dictionary in `.secrets/token.json` 117 | 6. Click Submit 118 | 7. Get the `API URL` (https://your-email-assistant-xxx.us.langgraph.app) from the deployment page 119 | 120 | ### 2. Run Ingestion with Hosted Deployment 121 | 122 | Once your LangGraph deployment is up and running, you can test the email ingestion with: 123 | 124 | ```bash 125 | python src/email_assistant/tools/gmail/run_ingest.py --email lance@langchain.dev --minutes-since 2440 --include-read --url https://your-email-assistant-xxx.us.langgraph.app 126 | ``` 127 | 128 | ### 3. Connect to Agent Inbox 129 | 130 | After ingestion, you can access all your interrupted threads in Agent Inbox (https://dev.agentinbox.ai/): 131 | * Deployment URL: https://your-email-assistant-xxx.us.langgraph.app 132 | * Assistant/Graph ID: `email_assistant_hitl_memory_gmail` 133 | * Name: `Graph Name` 134 | * LangSmith API Key: `LANGSMITH_API_KEY` 135 | 136 | ### 4. Set up Cron Job 137 | 138 | With a hosted deployment, you can set up a cron job to run the ingestion script at a specified interval. 
139 | 140 | To automate email ingestion, set up a scheduled cron job using the included setup script: 141 | 142 | ```bash 143 | python src/email_assistant/tools/gmail/setup_cron.py --email lance@langchain.dev --url https://lance-email-assistant-4681ae9646335abe9f39acebbde8680b.us.langgraph.app 144 | ``` 145 | 146 | #### Parameters: 147 | 148 | - `--email`: Email address to fetch messages for (required) 149 | - `--url`: LangGraph deployment URL (required) 150 | - `--minutes-since`: Only fetch emails newer than this many minutes (default: 60) 151 | - `--schedule`: Cron schedule expression (default: "*/10 * * * *" = every 10 minutes) 152 | - `--graph-name`: Name of the graph to use (default: "email_assistant_hitl_memory_gmail") 153 | - `--include-read`: Include emails marked as read (by default only unread emails are processed) (default: false) 154 | 155 | #### How the Cron Works 156 | 157 | The cron consists of two main components: 158 | 159 | 1. **`src/email_assistant/cron.py`**: Defines a simple LangGraph graph that: 160 | - Calls the same `fetch_and_process_emails` function used by `run_ingest.py` 161 | - Wraps this in a simple graph so that it can be run as a hosted cron using LangGraph Platform 162 | 163 | 2. 
**`src/email_assistant/tools/gmail/setup_cron.py`**: Creates the scheduled cron job: 164 | - Uses LangGraph SDK `client.crons.create` to create a cron job for the hosted `cron.py` graph 165 | 166 | #### Managing Cron Jobs 167 | 168 | To view, update, or delete existing cron jobs, you can use the LangGraph SDK: 169 | 170 | ```python 171 | from langgraph_sdk import get_client 172 | 173 | # Connect to deployment 174 | client = get_client(url="https://your-deployment-url.us.langgraph.app") 175 | 176 | # List all cron jobs 177 | cron_jobs = await client.crons.list() 178 | print(cron_jobs) 179 | 180 | # Delete a cron job 181 | await client.crons.delete(cron_job_id) 182 | ``` 183 | 184 | ## How Gmail Ingestion Works 185 | 186 | The Gmail ingestion process works in three main stages: 187 | 188 | ### 1. CLI Parameters → Gmail Search Query 189 | 190 | CLI parameters are translated into a Gmail search query: 191 | 192 | - `--minutes-since 1440` → `after:TIMESTAMP` (emails from the last 24 hours) 193 | - `--email you@example.com` → `to:you@example.com OR from:you@example.com` (emails where you're sender or recipient) 194 | - `--include-read` → removes `is:unread` filter (includes read messages) 195 | 196 | For example, running: 197 | ``` 198 | python run_ingest.py --email you@example.com --minutes-since 1440 --include-read 199 | ``` 200 | 201 | Creates a Gmail API search query like: 202 | ``` 203 | (to:you@example.com OR from:you@example.com) after:1745432245 204 | ``` 205 | 206 | ### 2. Search Results → Thread Processing 207 | 208 | For each message returned by the search: 209 | 210 | 1. The script obtains the thread ID 211 | 2. Using this thread ID, it fetches the **complete thread** with all messages 212 | 3. Messages in the thread are sorted by date to identify the latest message 213 | 4. 
Depending on filtering options, it processes either: 214 | - The specific message found in the search (default behavior) 215 | - The latest message in the thread (when using `--skip-filters`) 216 | 217 | ### 3. Default Filters and `--skip-filters` Behavior 218 | 219 | #### Default Filters Applied 220 | 221 | Without `--skip-filters`, the system applies these three filters in sequence: 222 | 223 | 1. **Unread Filter** (controlled by `--include-read`): 224 | - Default behavior: Only processes unread messages 225 | - With `--include-read`: Processes both read and unread messages 226 | - Implementation: Adds `is:unread` to the Gmail search query 227 | - This filter happens at the search level before any messages are retrieved 228 | 229 | 2. **Sender Filter**: 230 | - Default behavior: Skips messages sent by your own email address 231 | - Implementation: Checks if your email appears in the "From" header 232 | - Logic: `is_from_user = email_address in from_header` 233 | - This prevents the assistant from responding to your own emails 234 | 235 | 3. **Thread-Position Filter**: 236 | - Default behavior: Only processes the most recent message in each thread 237 | - Implementation: Compares message ID with the last message in thread 238 | - Logic: `is_latest_in_thread = message["id"] == last_message["id"]` 239 | - Prevents processing older messages when a newer reply exists 240 | 241 | The combination of these filters means only the latest message in each thread that was not sent by you and is unread (unless `--include-read` is specified) will be processed. 242 | 243 | #### Effect of `--skip-filters` Flag 244 | 245 | When `--skip-filters` is enabled: 246 | 247 | 1. **Bypasses Sender and Thread-Position Filters**: 248 | - Messages sent by you will be processed 249 | - Messages that aren't the latest in thread will be processed 250 | - Logic: `should_process = skip_filters or (not is_from_user and is_latest_in_thread)` 251 | 252 | 2. 
**Changes Which Message Is Processed**: 253 | - Without `--skip-filters`: Uses the specific message found by search 254 | - With `--skip-filters`: Always uses the latest message in the thread 255 | - Even if the latest message wasn't found in the search results 256 | 257 | 3. **Unread Filter Still Applies (unless overridden)**: 258 | - `--skip-filters` does NOT bypass the unread filter 259 | - To process read messages, you must still use `--include-read` 260 | - This is because the unread filter happens at the search level 261 | 262 | In summary: 263 | - Default: Process only unread messages where you're not the sender and that are the latest in their thread 264 | - `--skip-filters`: Process all messages found by search, using the latest message in each thread 265 | - `--include-read`: Include read messages in the search 266 | - `--include-read --skip-filters`: Most comprehensive, processes the latest message in all threads found by search 267 | 268 | ## Important Gmail API Limitations 269 | 270 | The Gmail API has several limitations that affect email ingestion: 271 | 272 | 1. **Search-Based API**: Gmail doesn't provide a direct "get all emails from timeframe" endpoint 273 | - All email retrieval relies on Gmail's search functionality 274 | - Search results can be delayed for very recent messages (indexing lag) 275 | - Search results might not include all messages that technically match criteria 276 | 277 | 2. 
**Two-Stage Retrieval Process**: 278 | - Initial search to find relevant message IDs 279 | - Secondary thread retrieval to get complete conversations 280 | - This two-stage process is necessary because search doesn't guarantee complete thread information -------------------------------------------------------------------------------- /src/email_assistant/tools/gmail/__init__.py: -------------------------------------------------------------------------------- 1 | """Gmail tools for email assistant.""" 2 | 3 | from src.email_assistant.tools.gmail.gmail_tools import ( 4 | fetch_emails_tool, 5 | send_email_tool, 6 | check_calendar_tool, 7 | schedule_meeting_tool 8 | ) 9 | 10 | from src.email_assistant.tools.gmail.prompt_templates import GMAIL_TOOLS_PROMPT 11 | 12 | __all__ = [ 13 | "fetch_emails_tool", 14 | "send_email_tool", 15 | "check_calendar_tool", 16 | "schedule_meeting_tool", 17 | "GMAIL_TOOLS_PROMPT" 18 | ] -------------------------------------------------------------------------------- /src/email_assistant/tools/gmail/prompt_templates.py: -------------------------------------------------------------------------------- 1 | """Tool prompt templates for Gmail integration.""" 2 | 3 | # Gmail tools prompt for insertion into agent system prompts 4 | GMAIL_TOOLS_PROMPT = """ 5 | 1. fetch_emails_tool(email_address, minutes_since) - Fetch recent emails from Gmail 6 | 2. send_email_tool(email_id, response_text, email_address, additional_recipients) - Send a reply to an email thread 7 | 3. check_calendar_tool(dates) - Check Google Calendar availability for specific dates 8 | 4. schedule_meeting_tool(attendees, title, start_time, end_time, organizer_email, timezone) - Schedule a meeting and send invites 9 | 5. triage_email(ignore, notify, respond) - Triage emails into one of three categories 10 | 6. Done - E-mail has been sent 11 | """ 12 | 13 | # Combined tools prompt (default + Gmail) for full integration 14 | COMBINED_TOOLS_PROMPT = """ 15 | 1. 
fetch_emails_tool(email_address, minutes_since) - Fetch recent emails from Gmail 16 | 2. send_email_tool(email_id, response_text, email_address, additional_recipients) - Send a reply to an email thread 17 | 3. check_calendar_tool(dates) - Check Google Calendar availability for specific dates 18 | 4. schedule_meeting_tool(attendees, title, start_time, end_time, organizer_email, timezone) - Schedule a meeting and send invites 19 | 5. write_email(to, subject, content) - Draft emails to specified recipients 20 | 6. triage_email(ignore, notify, respond) - Triage emails into one of three categories 21 | 7. check_calendar_availability(day) - Check available time slots for a given day 22 | 8. Done - E-mail has been sent 23 | """ -------------------------------------------------------------------------------- /src/email_assistant/tools/gmail/run_ingest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Simple Gmail ingestion script based directly on test.ipynb that works with LangSmith tracing. 4 | 5 | This script provides a minimal implementation for ingesting emails to the LangGraph server, 6 | with reliable LangSmith tracing. 
7 | """ 8 | 9 | import base64 10 | import json 11 | import uuid 12 | import hashlib 13 | import asyncio 14 | import argparse 15 | import os 16 | from pathlib import Path 17 | from datetime import datetime 18 | from google.oauth2.credentials import Credentials 19 | from googleapiclient.discovery import build 20 | from langgraph_sdk import get_client 21 | 22 | # Setup paths 23 | _ROOT = Path(__file__).parent.absolute() 24 | _SECRETS_DIR = _ROOT / ".secrets" 25 | TOKEN_PATH = _SECRETS_DIR / "token.json" 26 | 27 | def extract_message_part(payload): 28 | """Extract content from a message part.""" 29 | # If this is multipart, process with preference for text/plain 30 | if payload.get("parts"): 31 | # First try to find text/plain part 32 | for part in payload["parts"]: 33 | mime_type = part.get("mimeType", "") 34 | if mime_type == "text/plain" and part.get("body", {}).get("data"): 35 | data = part["body"]["data"] 36 | return base64.urlsafe_b64decode(data).decode("utf-8") 37 | 38 | # If no text/plain found, try text/html 39 | for part in payload["parts"]: 40 | mime_type = part.get("mimeType", "") 41 | if mime_type == "text/html" and part.get("body", {}).get("data"): 42 | data = part["body"]["data"] 43 | return base64.urlsafe_b64decode(data).decode("utf-8") 44 | 45 | # If we still haven't found content, recursively check for nested parts 46 | for part in payload["parts"]: 47 | content = extract_message_part(part) 48 | if content: 49 | return content 50 | 51 | # Not multipart, try to get content directly 52 | if payload.get("body", {}).get("data"): 53 | data = payload["body"]["data"] 54 | return base64.urlsafe_b64decode(data).decode("utf-8") 55 | 56 | return "" 57 | 58 | def load_gmail_credentials(): 59 | """ 60 | Load Gmail credentials from token.json or environment variables. 61 | 62 | This function attempts to load credentials from multiple sources in this order: 63 | 1. Environment variables GMAIL_TOKEN 64 | 2. 
def load_gmail_credentials():
    """Build a Google OAuth2 ``Credentials`` object for the Gmail API.

    Token JSON is looked up in order:
    1. The ``GMAIL_TOKEN`` environment variable
    2. The local file at ``TOKEN_PATH`` (.secrets/token.json)

    Returns:
        google.oauth2.credentials.Credentials, or None when no usable
        token data could be found or the object could not be constructed.
    """
    def _token_from_env():
        # Parse GMAIL_TOKEN if set; None when unset or unparseable.
        raw = os.getenv("GMAIL_TOKEN")
        if not raw:
            return None
        try:
            parsed = json.loads(raw)
        except Exception as e:
            print(f"Could not parse GMAIL_TOKEN environment variable: {str(e)}")
            return None
        print("Using GMAIL_TOKEN environment variable")
        return parsed

    def _token_from_file():
        # Fall back to the local token file.
        if not TOKEN_PATH.exists():
            print(f"Token file not found at {TOKEN_PATH}")
            return None
        try:
            with open(TOKEN_PATH, "r") as f:
                parsed = json.load(f)
        except Exception as e:
            print(f"Could not load token from {TOKEN_PATH}: {str(e)}")
            return None
        print(f"Using token from {TOKEN_PATH}")
        return parsed

    token_data = _token_from_env()
    if token_data is None:
        token_data = _token_from_file()

    # If we couldn't get token data from any source, give up.
    if token_data is None:
        print("Could not find valid token data in any location")
        return None

    try:
        return Credentials(
            token=token_data.get("token"),
            refresh_token=token_data.get("refresh_token"),
            token_uri=token_data.get("token_uri", "https://oauth2.googleapis.com/token"),
            client_id=token_data.get("client_id"),
            client_secret=token_data.get("client_secret"),
            scopes=token_data.get("scopes", ["https://www.googleapis.com/auth/gmail.modify"])
        )
    except Exception as e:
        print(f"Error creating credentials object: {str(e)}")
        return None
def extract_email_data(message):
    """Extract key information from a Gmail API message resource.

    Args:
        message: Full message resource from ``users.messages.get`` —
            must contain ``payload`` (with ``headers``), ``id`` and
            ``threadId``.

    Returns:
        dict with keys ``from_email``, ``to_email``, ``subject``,
        ``page_content`` (decoded body text), ``id``, ``thread_id`` and
        ``send_time``. Missing headers fall back to placeholder strings
        rather than raising.
    """
    headers = message['payload']['headers']

    def _header(name, default):
        # RFC 5322 header names are case-insensitive; match accordingly
        # so e.g. a lowercase "subject" header is still found.
        wanted = name.lower()
        return next(
            (h['value'] for h in headers if h['name'].lower() == wanted),
            default,
        )

    # Extract key headers (with safe defaults)
    subject = _header('Subject', 'No Subject')
    from_email = _header('From', 'Unknown Sender')
    to_email = _header('To', 'Unknown Recipient')
    date = _header('Date', 'Unknown Date')

    # Decode the (possibly multipart) body into plain text
    content = extract_message_part(message['payload'])

    # Create email data object
    return {
        "from_email": from_email,
        "to_email": to_email,
        "subject": subject,
        "page_content": content,
        "id": message['id'],
        "thread_id": message['threadId'],
        "send_time": date,
    }
{str(e)}") 175 | except Exception as e: 176 | print(f"Error listing/deleting runs: {str(e)}") 177 | 178 | # Update thread metadata with current email ID 179 | await client.threads.update(thread_id, metadata={"email_id": email_data["id"]}) 180 | 181 | # Create a fresh run for this email 182 | print(f"Creating run for thread {thread_id} with graph {graph_name}") 183 | 184 | run = await client.runs.create( 185 | thread_id, 186 | graph_name, 187 | input={"email_input": { 188 | "from": email_data["from_email"], 189 | "to": email_data["to_email"], 190 | "subject": email_data["subject"], 191 | "body": email_data["page_content"], 192 | "id": email_data["id"] 193 | }}, 194 | multitask_strategy="rollback", 195 | ) 196 | 197 | print(f"Run created successfully with thread ID: {thread_id}") 198 | 199 | return thread_id, run 200 | 201 | async def fetch_and_process_emails(args): 202 | """Fetch emails from Gmail and process them through LangGraph.""" 203 | # Load Gmail credentials 204 | credentials = load_gmail_credentials() 205 | if not credentials: 206 | print("Failed to load Gmail credentials") 207 | return 1 208 | 209 | # Build Gmail service 210 | service = build("gmail", "v1", credentials=credentials) 211 | 212 | # Process emails 213 | processed_count = 0 214 | 215 | try: 216 | # Get messages from the specified email address 217 | email_address = args.email 218 | 219 | # Construct Gmail search query 220 | query = f"to:{email_address} OR from:{email_address}" 221 | 222 | # Add time constraint if specified 223 | if args.minutes_since > 0: 224 | # Calculate timestamp for filtering 225 | from datetime import timedelta 226 | after = int((datetime.now() - timedelta(minutes=args.minutes_since)).timestamp()) 227 | query += f" after:{after}" 228 | 229 | # Only include unread emails unless include_read is True 230 | if not args.include_read: 231 | query += " is:unread" 232 | 233 | print(f"Gmail search query: {query}") 234 | 235 | # Execute the search 236 | results = 
async def fetch_and_process_emails(args):
    """Fetch emails from Gmail and process them through LangGraph.

    Builds a Gmail search query from the CLI args, retrieves each
    matching message, and hands it to ingest_email_to_langgraph.

    Args:
        args: Parsed CLI namespace (see parse_args): email, minutes_since,
            include_read, early, rerun, graph_name, url. NOTE(review):
            args.skip_filters is accepted by parse_args but never used here.

    Returns:
        Process exit code: 0 on success (including "no emails found"),
        1 when credentials fail to load or an exception occurs.
    """
    # Load Gmail credentials
    credentials = load_gmail_credentials()
    if not credentials:
        print("Failed to load Gmail credentials")
        return 1

    # Build Gmail service
    service = build("gmail", "v1", credentials=credentials)

    # Process emails
    processed_count = 0

    try:
        # Get messages from the specified email address
        email_address = args.email

        # Match messages where the address is sender or recipient.
        # NOTE(review): no parentheses around the OR pair — confirm Gmail
        # groups this as (to OR from) AND the filters appended below.
        query = f"to:{email_address} OR from:{email_address}"

        # Add time constraint if specified
        if args.minutes_since > 0:
            # Calculate timestamp for filtering
            from datetime import timedelta
            after = int((datetime.now() - timedelta(minutes=args.minutes_since)).timestamp())
            query += f" after:{after}"

        # Only include unread emails unless include_read is True
        if not args.include_read:
            query += " is:unread"

        print(f"Gmail search query: {query}")

        # Execute the search.
        # NOTE(review): only the first results page is processed; add
        # pageToken handling if more matches than one page are expected.
        results = service.users().messages().list(userId="me", q=query).execute()
        messages = results.get("messages", [])

        if not messages:
            print("No emails found matching the criteria")
            return 0

        print(f"Found {len(messages)} emails")

        # Process each email
        for i, message_info in enumerate(messages):
            # --early: stop after the first email has been processed
            if args.early and i > 0:
                print(f"Early stop after processing {i} emails")
                break

            # Check if we should reprocess this email
            if not args.rerun:
                # TODO: Add check for already processed emails
                pass

            # Get the full message
            message = service.users().messages().get(userId="me", id=message_info["id"]).execute()

            # Extract email data
            email_data = extract_email_data(message)

            print(f"\nProcessing email {i+1}/{len(messages)}:")
            print(f"From: {email_data['from_email']}")
            print(f"Subject: {email_data['subject']}")

            # Ingest to LangGraph
            thread_id, run = await ingest_email_to_langgraph(
                email_data,
                args.graph_name,
                url=args.url
            )

            processed_count += 1

        print(f"\nProcessed {processed_count} emails successfully")
        return 0

    except Exception as e:
        # Broad catch keeps the CLI from crashing with a traceback; the
        # failure is reported through the exit code instead.
        print(f"Error processing emails: {str(e)}")
        return 1
to use" 304 | ) 305 | parser.add_argument( 306 | "--url", 307 | type=str, 308 | default="http://127.0.0.1:2024", 309 | help="URL of the LangGraph deployment" 310 | ) 311 | parser.add_argument( 312 | "--early", 313 | action="store_true", 314 | help="Early stop after processing one email" 315 | ) 316 | parser.add_argument( 317 | "--include-read", 318 | action="store_true", 319 | help="Include emails that have already been read" 320 | ) 321 | parser.add_argument( 322 | "--rerun", 323 | action="store_true", 324 | help="Process the same emails again even if already processed" 325 | ) 326 | parser.add_argument( 327 | "--skip-filters", 328 | action="store_true", 329 | help="Skip filtering of emails" 330 | ) 331 | return parser.parse_args() 332 | 333 | if __name__ == "__main__": 334 | # Get command line arguments 335 | args = parse_args() 336 | 337 | # Run the script 338 | exit(asyncio.run(fetch_and_process_emails(args))) -------------------------------------------------------------------------------- /src/email_assistant/tools/gmail/setup_cron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Setup cron job for email ingestion in LangGraph. 4 | 5 | This script creates a scheduled cron job in LangGraph that periodically 6 | runs the email ingestion graph to process new emails. 
7 | """ 8 | 9 | import argparse 10 | import asyncio 11 | from typing import Optional 12 | from langgraph_sdk import get_client 13 | 14 | async def main( 15 | email: str, 16 | url: Optional[str] = None, 17 | minutes_since: int = 60, 18 | schedule: str = "*/10 * * * *", 19 | graph_name: str = "email_assistant_hitl_memory_gmail", 20 | include_read: bool = False, 21 | ): 22 | """Set up a cron job for email ingestion""" 23 | # Connect to LangGraph server 24 | if url is None: 25 | client = get_client(url="http://127.0.0.1:2024") 26 | else: 27 | client = get_client(url=url) 28 | 29 | # Create cron job configuration 30 | cron_input = { 31 | "email": email, 32 | "minutes_since": minutes_since, 33 | "graph_name": graph_name, 34 | "url": url if url else "http://127.0.0.1:2024", 35 | "include_read": include_read, 36 | "rerun": False, 37 | "early": False, 38 | "skip_filters": False 39 | } 40 | 41 | # Register the cron job 42 | cron = await client.crons.create( 43 | "cron", # The graph name for the cron 44 | schedule=schedule, # Cron schedule expression 45 | input=cron_input # Input parameters for the cron graph 46 | ) 47 | 48 | print(f"Cron job created successfully with schedule: {schedule}") 49 | print(f"Email ingestion will run for: {email}") 50 | print(f"Processing emails from the past {minutes_since} minutes") 51 | print(f"Using graph: {graph_name}") 52 | 53 | return cron 54 | 55 | if __name__ == "__main__": 56 | parser = argparse.ArgumentParser(description="Set up a cron job for email ingestion in LangGraph") 57 | 58 | parser.add_argument( 59 | "--email", 60 | type=str, 61 | required=True, 62 | help="Email address to fetch messages for", 63 | ) 64 | parser.add_argument( 65 | "--url", 66 | type=str, 67 | required=True, 68 | help="URL to the LangGraph server", 69 | ) 70 | parser.add_argument( 71 | "--minutes-since", 72 | type=int, 73 | default=60, 74 | help="Only process emails that are less than this many minutes old", 75 | ) 76 | parser.add_argument( 77 | "--schedule", 78 
if __name__ == "__main__":
    # CLI wrapper: parse ingestion/schedule options and register the cron.
    parser = argparse.ArgumentParser(description="Set up a cron job for email ingestion in LangGraph")

    parser.add_argument(
        "--email",
        type=str,
        required=True,
        help="Email address to fetch messages for",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="URL to the LangGraph server",
    )
    parser.add_argument(
        "--minutes-since",
        type=int,
        default=60,
        help="Only process emails that are less than this many minutes old",
    )
    parser.add_argument(
        "--schedule",
        type=str,
        default="*/10 * * * *",
        help="Cron schedule expression (default: every 10 minutes)",
    )
    parser.add_argument(
        "--graph-name",
        type=str,
        default="email_assistant_hitl_memory_gmail",
        help="Name of the graph to use for processing emails",
    )
    parser.add_argument(
        "--include-read",
        action="store_true",
        help="Include emails that have already been read",
    )

    args = parser.parse_args()

    # main() is async because the LangGraph SDK client is async.
    asyncio.run(
        main(
            email=args.email,
            url=args.url,
            minutes_since=args.minutes_since,
            schedule=args.schedule,
            graph_name=args.graph_name,
            include_read=args.include_read,
        )
    )
def main():
    """Run the interactive Gmail/Calendar OAuth setup.

    Creates the .secrets directory next to this script, validates that
    the OAuth client secrets file exists, runs the browser-based consent
    flow, and persists the resulting token to .secrets/token.json in the
    shape expected by the ingestion script's credential loader.

    Returns:
        0 on success, 1 on any failure (missing secrets or a failed flow).
    """
    # Create .secrets directory
    secrets_dir = Path(__file__).parent.absolute() / ".secrets"
    secrets_dir.mkdir(parents=True, exist_ok=True)

    # Check for secrets.json
    secrets_path = secrets_dir / "secrets.json"
    if not secrets_path.exists():
        print(f"Error: Client secrets file not found at {secrets_path}")
        print("Please download your OAuth client ID JSON from Google Cloud Console")
        print("and save it as .secrets/secrets.json")
        return 1

    print("Starting Gmail API authentication flow...")
    print("A browser window will open for you to authorize access.")

    # This will trigger the OAuth flow and create token.json
    try:
        # Scopes requested: Gmail read/modify plus full Calendar access.
        SCOPES = [
            'https://www.googleapis.com/auth/gmail.modify',
            'https://www.googleapis.com/auth/calendar'
        ]

        # Create the flow from the client secrets file; the helper reads
        # and validates the JSON itself, so the previous separate
        # json.load of the file (an unused local) was removed.
        flow = InstalledAppFlow.from_client_secrets_file(
            str(secrets_path),
            SCOPES
        )

        # Run the OAuth flow (port=0 picks a free local port for the redirect)
        credentials = flow.run_local_server(port=0)

        # Save the credentials to token.json
        token_path = secrets_dir / "token.json"
        token_data = {
            'token': credentials.token,
            'refresh_token': credentials.refresh_token,
            'token_uri': credentials.token_uri,
            'client_id': credentials.client_id,
            'client_secret': credentials.client_secret,
            'scopes': credentials.scopes,
            'universe_domain': 'googleapis.com',
            'account': '',
            # NOTE(review): assumes credentials.expiry is a naive UTC
            # datetime (google-auth convention) — hence the literal "Z".
            'expiry': credentials.expiry.isoformat() + "Z"
        }

        with open(token_path, 'w') as token_file:
            json.dump(token_data, token_file)

        print("\nAuthentication successful!")
        print(f"Access token stored at {token_path}")
        return 0
    except Exception as e:
        print(f"Authentication failed: {str(e)}")
        return 1
def format_email_markdown(subject, author, to, email_thread, email_id=None):
    """Render an email as a markdown block for display.

    Args:
        subject: Email subject line.
        author: Sender ("From") string.
        to: Recipient ("To") string.
        email_thread: Email body/thread content.
        email_id: Optional message ID (Gmail API); shown only when truthy.

    Returns:
        A markdown-formatted string with the header fields, an optional
        ID line, the body, and a trailing horizontal rule.
    """
    if email_id:
        id_section = f"\n**ID**: {email_id}"
    else:
        id_section = ""

    return (
        "\n"
        f"\n**Subject**: {subject}"
        f"\n**From**: {author}"
        f"\n**To**: {to}{id_section}"
        f"\n\n{email_thread}"
        "\n\n---\n"
    )
def parse_email(email_input: dict) -> tuple[str, str, str, str]:
    """Parse an email input dictionary.

    The return annotation previously said ``dict``; the function has
    always returned a tuple (as the docstring stated), so the annotation
    is corrected here.

    Args:
        email_input (dict): Dictionary containing email fields:
            - author: Sender's name and email
            - to: Recipient's name and email
            - subject: Email subject line
            - email_thread: Full email content

    Returns:
        tuple[str, str, str, str]: (author, to, subject, email_thread)

    Raises:
        KeyError: If any expected field is missing.
    """
    return (
        email_input["author"],
        email_input["to"],
        email_input["subject"],
        email_input["email_thread"],
    )

def parse_gmail(email_input: dict) -> tuple[str, str, str, str, str]:
    """Parse an email input dictionary for Gmail, including the email ID.

    This function extends parse_email by also returning the email ID,
    which is used specifically in the Gmail integration.

    Args:
        email_input (dict): Dictionary using the Gmail schema:
            - from: Sender's email
            - to: Recipient's email
            - subject: Email subject line
            - body: Full email content
            - id: Gmail message ID

    Returns:
        tuple[str, str, str, str, str]: (author, to, subject,
        email_thread, email_id)

    Raises:
        KeyError: If any expected field is missing.
    """
    # NOTE(review): debug output left in place deliberately (behavior
    # unchanged); remove once Gmail ingestion is stable.
    print("!Email_input from Gmail!")
    print(email_input)

    # Gmail schema
    return (
        email_input["from"],
        email_input["to"],
        email_input["subject"],
        email_input["body"],
        email_input["id"],
    )
def extract_tool_calls(messages: List[Any]) -> List[str]:
    """Collect lower-cased tool-call names from a message sequence.

    Handles both dict-shaped messages (with a "tool_calls" key) and
    message objects (with a ``tool_calls`` attribute); messages without
    tool calls are skipped.
    """
    names: List[str] = []
    for msg in messages:
        # Dict messages expose tool calls under a key; objects under an
        # attribute. Either way, a falsy/missing value means "no calls".
        if isinstance(msg, dict):
            calls = msg.get("tool_calls")
        else:
            calls = getattr(msg, "tool_calls", None)
        if calls:
            names.extend(call["name"].lower() for call in calls)
    return names
def show_graph(graph, xray=False):
    """Display a LangGraph mermaid diagram with fallback rendering.

    Tries the default mermaid.ink renderer first; on any failure (for
    example a mermaid.ink timeout) falls back to local rendering via
    pyppeteer.

    Args:
        graph: The LangGraph object that has a get_graph() method.
        xray: Whether to render the expanded (x-ray) view of the graph.

    Returns:
        IPython.display.Image containing the rendered PNG.
    """
    from IPython.display import Image
    try:
        # Try the default renderer first
        return Image(graph.get_graph(xray=xray).draw_mermaid_png())
    except Exception:
        # Fall back to pyppeteer if the default renderer fails.
        import nest_asyncio
        nest_asyncio.apply()
        from langchain_core.runnables.graph import MermaidDrawMethod
        # Bug fix: the fallback previously called get_graph() without the
        # xray flag, so the fallback image silently differed from the
        # primary rendering.
        return Image(
            graph.get_graph(xray=xray).draw_mermaid_png(
                draw_method=MermaidDrawMethod.PYPPETEER
            )
        )
parser.add_argument("--rich-output", action="store_true", help="[DEPRECATED] LangSmith output is now enabled by default") 15 | parser.add_argument("--experiment-name", help="Name for the LangSmith experiment") 16 | parser.add_argument("--implementation", help="Run tests for a specific implementation") 17 | parser.add_argument("--all", action="store_true", help="Run tests for all implementations") 18 | args = parser.parse_args() 19 | 20 | # Base pytest options 21 | base_pytest_options = ["-v", "--disable-warnings", "--langsmith-output"] 22 | # The --langsmith-output flag is now enabled by default for all test runs 23 | # The --rich-output flag is kept for backward compatibility 24 | 25 | # Define available implementations 26 | implementations = [ 27 | "email_assistant", 28 | "email_assistant_hitl", 29 | "email_assistant_hitl_memory", 30 | ] 31 | 32 | # Determine which implementations to test 33 | if args.implementation: 34 | if args.implementation in implementations: 35 | implementations_to_test = [args.implementation] 36 | else: 37 | print(f"Error: Unknown implementation '{args.implementation}'") 38 | print(f"Available implementations: {', '.join(implementations)}") 39 | return 1 40 | elif args.all: 41 | implementations_to_test = implementations 42 | else: 43 | # Default to testing all implementations 44 | implementations_to_test = implementations 45 | 46 | # Run tests for each implementation 47 | for implementation in implementations_to_test: 48 | print(f"\nRunning tests for {implementation}...") 49 | 50 | # Set up LangSmith environment for this implementation 51 | os.environ["LANGSMITH_PROJECT"] = langsmith_project 52 | os.environ["LANGSMITH_TEST_SUITE"] = langsmith_project 53 | 54 | # Ensure tracing is enabled 55 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 56 | 57 | # Create a fresh copy of the pytest options for this run 58 | pytest_options = base_pytest_options.copy() 59 | 60 | # Add the module parameter for this specific implementation 61 | module_param = 
f"--agent-module={implementation}" 62 | pytest_options.append(module_param) 63 | 64 | # Determine which test files to run based on implementation 65 | test_files = ["test_response.py"] # All implementations run response tests 66 | 67 | # Run each test file 68 | print(f" Project: {langsmith_project}") 69 | print(f"\nℹ️ Test results for {implementation} are being logged to LangSmith") 70 | for test_file in test_files: 71 | print(f"\nRunning {test_file} for {implementation}...") 72 | experiment_name = f"Test: {test_file.split('/')[-1]} | Agent: {implementation}" 73 | print(f" Experiment: {experiment_name}") 74 | os.environ["LANGSMITH_EXPERIMENT"] = experiment_name 75 | 76 | # Run pytest from the tests directory 77 | cmd = ["python", "-m", "pytest", test_file] + pytest_options 78 | 79 | # Change to the script's directory to ensure correct imports 80 | script_dir = Path(__file__).parent 81 | cwd = os.getcwd() 82 | os.chdir(script_dir) 83 | result = subprocess.run(cmd, capture_output=True, text=True) 84 | os.chdir(cwd) # Restore original working directory 85 | 86 | # Print test output 87 | print(result.stdout) 88 | if result.stderr: 89 | print(result.stderr) 90 | 91 | if __name__ == "__main__": 92 | sys.exit(main() or 0) -------------------------------------------------------------------------------- /tests/test_notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import nbformat 5 | from nbconvert.preprocessors import ExecutePreprocessor 6 | from pathlib import Path 7 | import pytest 8 | 9 | NOTEBOOKS_DIR = Path(__file__).parent.parent / "notebooks" 10 | 11 | # Skip notebooks that require specific setup or take too long to execute in automated tests 12 | SKIP_NOTEBOOKS = [] 13 | 14 | def get_notebooks(): 15 | """Get all notebook paths except those in the skip list.""" 16 | notebooks = [] 17 | for nb_path in NOTEBOOKS_DIR.glob("**/*.ipynb"): 18 | if nb_path.name not in 
SKIP_NOTEBOOKS and not nb_path.name.startswith("."): 19 | notebooks.append(nb_path) 20 | return notebooks 21 | 22 | @pytest.mark.parametrize("notebook_path", get_notebooks()) 23 | def test_notebook_runs_without_errors(notebook_path): 24 | """Test that a notebook runs without errors.""" 25 | # Check if notebook exists 26 | if not notebook_path.exists(): 27 | pytest.skip(f"Notebook {notebook_path} does not exist") 28 | 29 | print(f"Testing notebook: {notebook_path}") 30 | 31 | # Read the notebook 32 | with open(notebook_path, encoding="utf-8") as f: 33 | nb = nbformat.read(f, as_version=4) 34 | 35 | # Create executor 36 | ep = ExecutePreprocessor(timeout=600, kernel_name="python3") 37 | 38 | try: 39 | # Execute the notebook 40 | ep.preprocess(nb, {"metadata": {"path": notebook_path.parent}}) 41 | except Exception as e: 42 | # Get the cell that caused the error 43 | for cell in nb.cells: 44 | if hasattr(cell, "outputs"): 45 | for output in cell.outputs: 46 | if output.output_type == "error": 47 | error_message = "\n".join(output.traceback) 48 | pytest.fail(f"Error in notebook {notebook_path}: {error_message}") 49 | # If we couldn't find the error in the notebook, raise the original exception 50 | pytest.fail(f"Error in notebook {notebook_path}: {str(e)}") 51 | 52 | if __name__ == "__main__": 53 | # This allows the script to be run directly 54 | notebooks = get_notebooks() 55 | for notebook in notebooks: 56 | try: 57 | test_notebook_runs_without_errors(notebook) 58 | print(f"✅ {notebook.name} passed") 59 | except Exception as e: 60 | print(f"❌ {notebook.name} failed: {str(e)}") 61 | sys.exit(1) 62 | print(f"All {len(notebooks)} notebooks executed successfully!") -------------------------------------------------------------------------------- /tests/test_response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import uuid 4 | import importlib 5 | import sys 6 | import pytest 7 | from typing import 
Dict, List, Any, Tuple 8 | from pydantic import BaseModel, Field 9 | from langchain.chat_models import init_chat_model 10 | 11 | from langsmith import testing as t 12 | 13 | from langgraph.checkpoint.memory import MemorySaver 14 | from langgraph.store.memory import InMemoryStore 15 | from langgraph.types import Command 16 | 17 | from src.email_assistant.utils import extract_tool_calls, format_messages_string 18 | from eval.prompts import RESPONSE_CRITERIA_SYSTEM_PROMPT 19 | 20 | from dotenv import load_dotenv 21 | load_dotenv(".env", override=True) 22 | 23 | # Force reload the email_dataset module to ensure we get the latest version 24 | if "eval.email_dataset" in sys.modules: 25 | importlib.reload(sys.modules["eval.email_dataset"]) 26 | from eval.email_dataset import email_inputs, email_names, response_criteria_list, triage_outputs_list, expected_tool_calls 27 | 28 | class CriteriaGrade(BaseModel): 29 | """Score the response against specific criteria.""" 30 | grade: bool = Field(description="Does the response meet the provided criteria?") 31 | justification: str = Field(description="The justification for the grade and score, including specific examples from the response.") 32 | 33 | # Create a global LLM for evaluation to avoid recreating it for each test 34 | criteria_eval_llm = init_chat_model("openai:gpt-4o") 35 | criteria_eval_structured_llm = criteria_eval_llm.with_structured_output(CriteriaGrade) 36 | 37 | # Global variables for module name and imported module 38 | AGENT_MODULE = None 39 | agent_module = None 40 | 41 | @pytest.fixture(autouse=True, scope="function") 42 | def set_agent_module(agent_module_name): 43 | """Set the global AGENT_MODULE for each test function. 
44 | Using scope="function" ensures we get a fresh import for each test.""" 45 | global AGENT_MODULE, agent_module 46 | AGENT_MODULE = agent_module_name 47 | print(f"Using agent module: {AGENT_MODULE}") 48 | 49 | # Force reload the module to ensure we get the latest code 50 | if f"src.email_assistant.{AGENT_MODULE}" in sys.modules: 51 | importlib.reload(sys.modules[f"src.email_assistant.{AGENT_MODULE}"]) 52 | 53 | agent_module = importlib.import_module(f"src.email_assistant.{AGENT_MODULE}") 54 | return AGENT_MODULE 55 | 56 | def setup_assistant() -> Tuple[Any, Dict[str, Any], InMemoryStore]: 57 | """ 58 | Setup the email assistant and create thread configuration. 59 | Returns the assistant, thread config, and store. 60 | """ 61 | # Set up checkpointer and store 62 | checkpointer = MemorySaver() 63 | store = InMemoryStore() 64 | 65 | # Create a thread ID and config 66 | thread_id = uuid.uuid4() 67 | thread_config = {"configurable": {"thread_id": thread_id}} 68 | 69 | # Compile the graph based on module type 70 | if AGENT_MODULE == "email_assistant_hitl_memory": 71 | # Memory implementation needs a store and a checkpointer 72 | email_assistant = agent_module.overall_workflow.compile(checkpointer=checkpointer, store=store) 73 | elif AGENT_MODULE in ["email_assistant_hitl"]: 74 | # Just use a checkpointer for HITL version 75 | email_assistant = agent_module.overall_workflow.compile(checkpointer=checkpointer) 76 | else: 77 | # Just use a checkpointer for other versions 78 | email_assistant = agent_module.overall_workflow.compile(checkpointer=checkpointer) 79 | store = None 80 | 81 | return email_assistant, thread_config, store 82 | 83 | def extract_values(state: Any) -> Dict[str, Any]: 84 | """Extract values from state object regardless of type.""" 85 | if hasattr(state, "values"): 86 | return state.values 87 | else: 88 | return state 89 | 90 | def run_initial_stream(email_assistant: Any, email_input: Dict, thread_config: Dict) -> List[Dict]: 91 | """Run the initial 
stream and return collected messages.""" 92 | messages = [] 93 | for chunk in email_assistant.stream({"email_input": email_input}, config=thread_config): 94 | messages.append(chunk) 95 | return messages 96 | 97 | def run_stream_with_command(email_assistant: Any, command: Command, thread_config: Dict) -> List[Dict]: 98 | """Run stream with a command and return collected messages.""" 99 | messages = [] 100 | for chunk in email_assistant.stream(command, config=thread_config): 101 | messages.append(chunk) 102 | return messages 103 | 104 | def is_module_compatible(required_modules: List[str]) -> bool: 105 | """Check if current module is compatible with test. 106 | 107 | Returns: 108 | bool: True if module is compatible, False otherwise 109 | """ 110 | return AGENT_MODULE in required_modules 111 | 112 | def create_response_test_cases(): 113 | """Create test cases for parametrized criteria evaluation with LangSmith. 114 | Only includes emails that require a response (triage_output == "respond"). 115 | These are more relevant / interesting for testing tool calling / response quality. 
116 | """ 117 | 118 | # Create tuples of (email_input, email_name, criteria) for parametrization 119 | # Only include emails that require a response (triage_output == "respond") 120 | test_cases = [] 121 | for i, (email_input, email_name, criteria, triage_output, expected_calls) in enumerate(zip( 122 | email_inputs, email_names, response_criteria_list, triage_outputs_list, expected_tool_calls 123 | )): 124 | if triage_output == "respond": 125 | # No need to include triage_output since we're filtering for "respond" only 126 | # Each test case is (email_input, email_name, criteria, expected_calls) 127 | test_cases.append((email_input, email_name, criteria, expected_calls)) 128 | 129 | print(f"Created {len(test_cases)} test cases for emails requiring responses") 130 | return test_cases 131 | 132 | # Reference output key 133 | @pytest.mark.langsmith(output_keys=["expected_calls"]) 134 | # Variable names and a list of tuples with the test cases 135 | @pytest.mark.parametrize("email_input,email_name,criteria,expected_calls",create_response_test_cases()) 136 | def test_email_dataset_tool_calls(email_input, email_name, criteria, expected_calls): 137 | """Test if email processing contains expected tool calls.""" 138 | # Log minimal inputs for LangSmith 139 | t.log_inputs({"module": AGENT_MODULE, "test": "test_email_dataset_tool_calls"}) 140 | 141 | print(f"Processing {email_name}...") 142 | 143 | # Set up the assistant 144 | email_assistant, thread_config, _ = setup_assistant() 145 | 146 | # Run the agent 147 | if AGENT_MODULE == "email_assistant": 148 | # Workflow agent takes email_input directly 149 | result = email_assistant.invoke({"email_input": email_input}, config=thread_config) 150 | 151 | elif AGENT_MODULE in ["email_assistant_hitl", "email_assistant_hitl_memory"]: 152 | # HITL agents need special handling with multiple interrupts 153 | 154 | # Create a function to process chunks and handle interrupts recursively 155 | def process_stream(input_data): 156 | result = 
{} 157 | # Stream and process all chunks 158 | for chunk in email_assistant.stream(input_data, config=thread_config): 159 | # Update result with chunk data 160 | result.update(chunk) 161 | # If we hit an interrupt, handle it with accept and continue 162 | if "__interrupt__" in chunk: 163 | # Create accept command 164 | resume_command = Command(resume=[{"type": "accept", "args": ""}]) 165 | # Recursively process the accept command 166 | interrupt_result = process_stream(resume_command) 167 | # Update result with interrupt processing result 168 | result.update(interrupt_result) 169 | return result 170 | 171 | # Start processing with the email input 172 | process_stream({"email_input": email_input}) 173 | else: 174 | # Other agents take email_input directly but will use interrupt 175 | _ = run_initial_stream(email_assistant, email_input, thread_config) 176 | 177 | # Provide feedback and resume the graph with 'accept' 178 | resume_command = Command(resume=[{"type": "accept", "args": ""}]) 179 | _ = run_stream_with_command(email_assistant, resume_command, thread_config) 180 | 181 | # Get the final state 182 | state = email_assistant.get_state(thread_config) 183 | values = extract_values(state) 184 | 185 | # Extract tool calls from messages 186 | extracted_tool_calls = extract_tool_calls(values["messages"]) 187 | 188 | # Check if all expected tool calls are in the extracted ones 189 | missing_calls = [call for call in expected_calls if call.lower() not in extracted_tool_calls] 190 | # Extra calls are allowed (we only fail if expected calls are missing) 191 | extra_calls = [call for call in extracted_tool_calls if call.lower() not in [c.lower() for c in expected_calls]] 192 | 193 | # Log 194 | all_messages_str = format_messages_string(values["messages"]) 195 | t.log_outputs({ 196 | "extracted_tool_calls": extracted_tool_calls, 197 | "missing_calls": missing_calls, 198 | "extra_calls": extra_calls, 199 | "response": all_messages_str 200 | }) 201 | 202 | # Pass feedback key 
203 | assert len(missing_calls) == 0 204 | 205 | # Reference output key 206 | @pytest.mark.langsmith(output_keys=["criteria"]) 207 | # Variable names and a list of tuples with the test cases 208 | # Each test case is (email_input, email_name, criteria, expected_calls) 209 | @pytest.mark.parametrize("email_input,email_name,criteria,expected_calls",create_response_test_cases()) 210 | def test_response_criteria_evaluation(email_input, email_name, criteria, expected_calls): 211 | """Test if a response meets the specified criteria. 212 | Only runs on emails that require a response. 213 | """ 214 | # Log minimal inputs for LangSmith 215 | t.log_inputs({"module": AGENT_MODULE, "test": "test_response_criteria_evaluation"}) 216 | 217 | print(f"Processing {email_name}...") 218 | 219 | # Set up the assistant 220 | email_assistant, thread_config, _ = setup_assistant() 221 | 222 | # Run the agent 223 | if AGENT_MODULE == "email_assistant": 224 | # Workflow agent takes email_input directly 225 | result = email_assistant.invoke({"email_input": email_input}, config=thread_config) 226 | 227 | elif AGENT_MODULE in ["email_assistant_hitl", "email_assistant_hitl_memory"]: 228 | # HITL agents need special handling with multiple interrupts 229 | 230 | # Create a function to process chunks and handle interrupts recursively 231 | def process_stream(input_data): 232 | result = {} 233 | # Stream and process all chunks 234 | for chunk in email_assistant.stream(input_data, config=thread_config): 235 | # Update result with chunk data 236 | result.update(chunk) 237 | # If we hit an interrupt, handle it with accept and continue 238 | if "__interrupt__" in chunk: 239 | # Create accept command 240 | resume_command = Command(resume=[{"type": "accept", "args": ""}]) 241 | # Recursively process the accept command 242 | interrupt_result = process_stream(resume_command) 243 | # Update result with interrupt processing result 244 | result.update(interrupt_result) 245 | return result 246 | 247 | # Start 
processing with the email input 248 | process_stream({"email_input": email_input}) 249 | else: 250 | # Other agents take email_input directly but will use interrupt 251 | _ = run_initial_stream(email_assistant, email_input, thread_config) 252 | 253 | # Provide feedback and resume the graph with 'accept' 254 | resume_command = Command(resume=[{"type": "accept", "args": ""}]) 255 | _ = run_stream_with_command(email_assistant, resume_command, thread_config) 256 | 257 | # Get the final state 258 | state = email_assistant.get_state(thread_config) 259 | values = extract_values(state) 260 | 261 | # Generate message output string for evaluation 262 | all_messages_str = format_messages_string(values['messages']) 263 | 264 | # Evaluate against criteria 265 | eval_result = criteria_eval_structured_llm.invoke([ 266 | {"role": "system", 267 | "content": RESPONSE_CRITERIA_SYSTEM_PROMPT}, 268 | {"role": "user", 269 | "content": f"""\n\n Response criteria: {criteria} \n\n Assistant's response: \n\n {all_messages_str} \n\n Evaluate whether the assistant's response meets the criteria and provide justification for your evaluation."""} 270 | ]) 271 | 272 | # Log feedback response 273 | t.log_outputs({ 274 | "justification": eval_result.justification, 275 | "response": all_messages_str, 276 | }) 277 | 278 | # Pass feedback key 279 | assert eval_result.grade 280 | 281 | --------------------------------------------------------------------------------