├── .python-version ├── media ├── kronologs-graph.webp ├── event-deep-research.webp └── kronologs-lgstudiograph.webp ├── index.py ├── Makefile ├── .gitignore ├── .env.example ├── .github └── repository.yml ├── langgraph.json ├── src ├── core │ └── error_handling.py ├── services │ ├── url_service.py │ └── event_service.py ├── research_events │ ├── merge_events │ │ ├── show.json │ │ ├── utils.py │ │ ├── shortcategorized.json │ │ ├── prompts.py │ │ ├── fullcategorized.json │ │ ├── merge_events_graph.py │ │ └── test.json │ ├── chunk_graph.py │ ├── result.json │ └── research_events_graph.py ├── url_crawler │ ├── url_krawler_graph.py │ ├── prompts.py │ └── utils.py ├── llm_service.py ├── utils.py ├── configuration.py ├── test │ ├── test_enhanced_merge_events.py │ ├── test_url_crawler.py │ ├── test_merge_events.py │ └── test_research_events.py ├── prompts.py ├── state.py └── graph.py ├── LICENSE.TXT ├── AGENTS.md ├── pyproject.toml ├── scripts └── geocode.py └── README.md /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /media/kronologs-graph.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/kronologs-graph.webp -------------------------------------------------------------------------------- /media/event-deep-research.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/event-deep-research.webp -------------------------------------------------------------------------------- /media/kronologs-lgstudiograph.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/kronologs-lgstudiograph.webp 
-------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | from langchain_core.messages import ToolMessage 2 | 3 | message = ToolMessage(content="Test", tool_call_id="123") 4 | 5 | 6 | print(message) 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dev: 6 | source .venv/bin/activate && uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking 7 | 8 | 9 | 10 | test: 11 | uv run pytest -v -s -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | 13 | **/.vscode/ 14 | 15 | **/.env 16 | 17 | 18 | backend/scripts/ 19 | backend/src/data/* 20 | 21 | .langgraph_api/** 22 | 23 | 24 | src/data/* 25 | 26 | .langgraph_api/* 27 | .pytest_cache/* -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | 2 | # Web scraping and search APIs 3 | FIRECRAWL_BASE_URL="https://api.firecrawl.dev" 4 | FIRECRAWL_API_KEY="" 5 | TAVILY_API_KEY= 6 | 7 | # LLM Provider API Keys (choose one or more) 8 | OPENAI_API_KEY= 9 | ANTHROPIC_API_KEY= 10 | GOOGLE_API_KEY= 11 | 12 | # Optional: Langfuse for observability 13 | LANGFUSE_PUBLIC_KEY="" 14 | LANGFUSE_SECRET_KEY="" 15 | LANGFUSE_HOST="" 16 | 17 | 18 | -------------------------------------------------------------------------------- /.github/repository.yml: -------------------------------------------------------------------------------- 1 | # 
GitHub repository metadata 2 | name: event-deep-research 3 | description: AI-powered agent that automatically researches historical figures and creates structured biographical timelines from web sources 4 | topics: 5 | - ai-agents 6 | - event-extraction 7 | - knowledge-graphs 8 | - web-scraping 9 | - langgraph 10 | - biographical-research 11 | - automated-research 12 | - llm-agents 13 | - timeline-generation 14 | -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerfile_lines": [], 3 | "graphs": { 4 | "supervisor": "./src/graph.py:graph", 5 | "research_events": "./src/research_events/research_events_graph.py:research_events_app", 6 | "merge_events_graph": "./src/research_events/merge_events/merge_events_graph.py:merge_events_app", 7 | "url_crawler": "./src/url_crawler/url_krawler_graph.py:url_crawler_app", 8 | "chunk_graph": "./src/research_events/chunk_graph.py:graph" 9 | }, 10 | "python_version": "3.12", 11 | "env": ".env", 12 | "dependencies": ["."], 13 | "auth": {} 14 | } 15 | -------------------------------------------------------------------------------- /src/core/error_handling.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from typing import Any, Dict 3 | 4 | from langgraph.types import Command 5 | 6 | 7 | class GraphError(Exception): 8 | def __init__(self, message: str, node: str, state: dict): 9 | self.message = message 10 | self.node = node 11 | self.state = state 12 | super().__init__(f"Error in {node}: {message}") 13 | 14 | 15 | def with_error_handling(func): 16 | @wraps(func) 17 | async def wrapper(state: Dict[str, Any], config) -> Command: 18 | try: 19 | return await func(state, config) 20 | except Exception as e: 21 | error_info = { 22 | "error": str(e), 23 | "node": func.__name__, 24 | "state_snapshot": state, 25 | } 26 | return 
Command(goto="error_handler", update=error_info) 27 | 28 | return wrapper 29 | -------------------------------------------------------------------------------- /src/services/url_service.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from typing import List 3 | 4 | 5 | class URLService: 6 | @staticmethod 7 | def extract_domain(url: str) -> str: 8 | """Extract domain from URL.""" 9 | return urlparse(url).netloc 10 | 11 | @staticmethod 12 | def update_url_list(urls: List[str], used_domains: List[str]) -> tuple[List[str], List[str]]: 13 | """Remove first URL from list and track its domain.""" 14 | if not urls: 15 | return urls, used_domains 16 | 17 | url = urls[0] 18 | domain = URLService.extract_domain(url) 19 | 20 | # Track used domains 21 | updated_used_domains = used_domains.copy() 22 | if domain not in updated_used_domains: 23 | updated_used_domains.append(domain) 24 | 25 | # Remove first URL 26 | remaining_urls = urls[1:] 27 | 28 | return remaining_urls, updated_used_domains -------------------------------------------------------------------------------- /src/research_events/merge_events/show.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Henry Miller was born", 4 | "description": "Henry Valentine Miller was born on December 26 in New York City.", 5 | "date": { 6 | "year": 1891, 7 | "note": "December 26" 8 | }, 9 | "location": "New York City", 10 | "id": "henry_miller_born", 11 | "new_information": "None" 12 | }, 13 | { 14 | "name": "Divorced Beatrice Sylvas Wickens", 15 | "description": "Miller was divorced from Beatrice Sylvas Wickens.", 16 | "date": { 17 | "year": 1923, 18 | "note": "December 21" 19 | }, 20 | "location": "New York", 21 | "id": "henry_miller_divorced_beatrice", 22 | "new_information": "None" 23 | }, 24 | { 25 | "name": "Daughter Barbara born", 26 | "description": "Miller and Beatrice had a daughter 
named Barbara.", 27 | "date": { 28 | "year": 1919, 29 | "note": "" 30 | }, 31 | "location": "New York", 32 | "id": "henry_miller_daughter_barbara_born", 33 | "new_information": "None" 34 | } 35 | ] 36 | -------------------------------------------------------------------------------- /src/services/event_service.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from src.state import CategoriesWithEvents 3 | 4 | 5 | class EventService: 6 | @staticmethod 7 | def split_events_into_chunks(extracted_events: str, max_len: int = 2000) -> List[str]: 8 | """Split events text into chunks of specified length.""" 9 | return [ 10 | extracted_events[i : i + max_len] 11 | for i in range(0, len(extracted_events), max_len) 12 | ] 13 | 14 | @staticmethod 15 | def merge_categorized_events(categorized_results: List[CategoriesWithEvents]) -> CategoriesWithEvents: 16 | """Merge multiple categorized event results into one.""" 17 | merged = CategoriesWithEvents( 18 | early="[]", 19 | personal="[]", 20 | career="[]", 21 | legacy="[]", 22 | ) 23 | 24 | for result in categorized_results: 25 | merged.early += result.early 26 | merged.personal += result.personal 27 | merged.career += result.career 28 | merged.legacy += result.legacy 29 | 30 | return merged -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Bernat Sampera 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # Development Guidelines for Deep Event Research 2 | 3 | ## Build/Test Commands 4 | - **Run all tests**: `make test` or `uv run pytest -v -s` 5 | - **Run single test**: `uv run pytest src/test/test_file.py::test_function -v` 6 | - **Run tests without LLM calls**: `uv run pytest -v -m 'not llm'` 7 | - **Run LLM integration tests**: `uv run pytest -v -m llm` 8 | - **Lint code**: `uv run ruff check src/` 9 | - **Format code**: `uv run ruff format src/` 10 | - **Start dev server**: `make dev` 11 | 12 | ## Code Style Guidelines 13 | - **Python**: 3.12+ with type hints required 14 | - **Imports**: Use `from src.module import name` for internal imports, standard library first 15 | - **Formatting**: Ruff with Google docstring convention 16 | - **Error handling**: Use `@with_error_handling` decorator for graph nodes, raise `GraphError` for known failures 17 | - **Async**: All graph functions must be async and return `Command` 18 | - **Testing**: Use pytest with asyncio mode, mock LLM calls by default, mark real LLM tests with `@pytest.mark.llm` 19 | - **State management**: Use TypedDict classes from `src.state.py` for all state objects 20 | - 
**Services**: Static methods in service classes, no instance state -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "backend" 3 | version = "0.1.0" 4 | description = "AI-powered agent that automatically researches historical figures and creates structured biographical timelines from web sources" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "python-dotenv>=0.9.9", 9 | "langchain>=0.3.27", 10 | "langchain-google-genai>=2.1.9", 11 | "langgraph>=0.6.7", 12 | "langgraph-cli", 13 | "langchain-openai>=0.3.33", 14 | "langchain-ollama>=0.3.8", 15 | "langchain-tavily>=0.2.11", 16 | "tiktoken>=0.11.0", 17 | "pytest>=8.4.2", 18 | "pytest-asyncio>=0.24.0", 19 | "langfuse>=3.5.2", 20 | "aiohttp>=3.8.0", 21 | ] 22 | 23 | [build-system] 24 | requires = ["hatchling"] 25 | build-backend = "hatchling.build" 26 | 27 | [tool.hatch.build.targets.wheel] 28 | packages = ["src"] 29 | 30 | [tool.ruff] 31 | src = ["src"] 32 | lint.select = [ 33 | "E", # pycodestyle 34 | "F", # pyflakes 35 | "I", # isort 36 | "D", # pydocstyle 37 | "D401", # First line should be in imperative mood 38 | "T201", 39 | "UP", 40 | ] 41 | lint.ignore = [ 42 | "UP006", 43 | "UP007", 44 | "UP035", 45 | "D417", 46 | "E501", 47 | ] 48 | 49 | [tool.ruff.lint.per-file-ignores] 50 | "tests/*" = ["D", "UP"] 51 | 52 | [tool.ruff.lint.pydocstyle] 53 | convention = "google" 54 | 55 | [tool.pytest.ini_options] 56 | testpaths = ["src/test"] 57 | pythonpath = ["src"] 58 | asyncio_mode = "auto" 59 | addopts = "-v -m 'not llm'" 60 | markers = [ 61 | "llm: marks tests that make real LLM API calls" 62 | ] 63 | 64 | [tool.setuptools.packages.find] 65 | where = ["src"] 66 | 67 | -------------------------------------------------------------------------------- /src/research_events/merge_events/utils.py: 
-------------------------------------------------------------------------------- 1 | from typing import Type, TypeVar, Union 2 | 3 | from pydantic import BaseModel 4 | from src.state import CategoriesWithEvents 5 | 6 | T = TypeVar("T", bound=BaseModel) 7 | 8 | 9 | def ensure_pydantic_model(data: Union[dict, T], model_class: Type[T]) -> T: 10 | """Converts a dictionary to a Pydantic model instance if needed. 11 | If the data is already an instance of the model class, returns it as-is. 12 | 13 | Args: 14 | data: Either a dictionary or an instance of the Pydantic model 15 | model_class: The Pydantic model class to convert to 16 | 17 | Returns: 18 | An instance of the Pydantic model class 19 | 20 | Examples: 21 | # Convert dict to CategoriesWithEvents 22 | events = ensure_pydantic_model(some_dict, CategoriesWithEvents) 23 | 24 | # If already a model, returns as-is 25 | events = ensure_pydantic_model(existing_model, CategoriesWithEvents) 26 | """ 27 | if isinstance(data, dict): 28 | return model_class(**data) 29 | elif isinstance(data, model_class): 30 | return data 31 | else: 32 | # Handle other cases - try to convert to dict first 33 | if hasattr(data, "__dict__"): 34 | return model_class(**data.__dict__) 35 | else: 36 | raise TypeError(f"Cannot convert {type(data)} to {model_class}") 37 | 38 | 39 | # This function is needed because sometimes the object comes as a dict and then it's tricky to access the variables. 40 | # There has to be a better way to do this in python, but this is the best I can come up with for now. 
41 | def ensure_categories_with_events( 42 | data: Union[dict, CategoriesWithEvents], 43 | ) -> "CategoriesWithEvents": 44 | """Specifically converts data to CategoriesWithEvents model.""" 45 | return ensure_pydantic_model(data, CategoriesWithEvents) 46 | -------------------------------------------------------------------------------- /src/url_crawler/url_krawler_graph.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Literal, TypedDict 3 | 4 | from langgraph.graph import END, START, StateGraph 5 | from langgraph.graph.state import Command 6 | from src.configuration import Configuration 7 | from src.url_crawler.utils import url_crawl 8 | from src.utils import get_langfuse_handler 9 | 10 | config = Configuration() 11 | MAX_CONTENT_LENGTH = config.max_content_length 12 | 13 | 14 | class InputUrlCrawlerState(TypedDict): 15 | url: str 16 | research_question: str 17 | 18 | 19 | class UrlCrawlerState(InputUrlCrawlerState): 20 | raw_scraped_content: str 21 | 22 | 23 | class OutputUrlCrawlerState(UrlCrawlerState): 24 | extracted_events: str 25 | raw_scraped_content: str 26 | 27 | 28 | async def scrape_content(state: UrlCrawlerState) -> Command[Literal["__end__"]]: 29 | """Scrapes URL content and returns it without any processing.""" 30 | url = state.get("url", "") 31 | 32 | content = await url_crawl(url) 33 | 34 | if len(content) > MAX_CONTENT_LENGTH: 35 | # At random start to get diverse content 36 | start_index = random.randint(0, len(content) - MAX_CONTENT_LENGTH) 37 | content = content[start_index : start_index + MAX_CONTENT_LENGTH] 38 | 39 | return Command( 40 | goto=END, 41 | update={ 42 | "raw_scraped_content": content, 43 | "extracted_events": content, # For compatibility with existing interface 44 | }, 45 | ) 46 | 47 | 48 | builder = StateGraph( 49 | UrlCrawlerState, 50 | input_schema=InputUrlCrawlerState, 51 | output_schema=OutputUrlCrawlerState, 52 | config_schema=Configuration, 53 | ) 
54 | 55 | builder.add_node("scrape_content", scrape_content) 56 | builder.add_edge(START, "scrape_content") 57 | 58 | 59 | url_crawler_app = builder.compile().with_config({"callbacks": [get_langfuse_handler()]}) 60 | -------------------------------------------------------------------------------- /src/url_crawler/prompts.py: -------------------------------------------------------------------------------- 1 | # --- Prompt 1: For extracting events from a small text chunk --- 2 | 3 | # --- Prompt 1: For extracting events from a small text chunk --- 4 | EXTRACT_EVENTS_PROMPT = """ 5 | You are a Biographical Event Extractor. Your single focus is to find events that directly answer the research question: **"{research_question}"** 6 | 7 | 8 | 9 | - `RelevantChunk` (use this if the text is almost entirely relevant (>80%)) 10 | - `PartialChunk` (use this if the text is a mix of relevant and irrelevant content) 11 | - `IrrelevantChunk` (use this if the text contains no events that are relevant to the biography of the person in the researc question) 12 | 13 | 14 | 15 | **EXTRACTION RULE for `PartialChunk`**: You *must* extract the complete relevant sentences, including all details like dates, names, locations, and context. Do not summarize. 16 | 17 | 18 | {text_chunk} 19 | 20 | 21 | You must call exactly one of the provided tools. Do not respond with plain text. 22 | Choose only the tool call and the tool call arguments. 23 | """ 24 | # src/url_crawler/prompts.py 25 | 26 | create_event_list_prompt = """You are a biographical assistant. Your task is to convert blocks of text that contains events of a person into single events where the date, description of the event, location of the event are included for {research_question}. 27 | 28 | **Instructions**: 29 | - Analyze the "New Extracted Events" and convert them into single events where the date, description of the event, location of the event are included. 30 | - **MAINTAIN** a chronological order. 
31 | 32 | **Output Format**: 33 | - A single, comprehensive, and chronological list in bullet points. 34 | 35 | 36 | New Extracted Events: 37 | ---- 38 | {newly_extracted_events} 39 | 40 | 41 | 42 | 43 | Provide the single, consolidated, and chronological list of biographical events. 44 | 45 | """ 46 | -------------------------------------------------------------------------------- /src/research_events/merge_events/shortcategorized.json: -------------------------------------------------------------------------------- 1 | { 2 | "existing_events": { 3 | "early": "- Henry Miller was born: Henry Valentine Miller was born on December 26 in New York City. (1891, New York City)\n- Attended Eastern District High School: Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school. (early 1900s, Williamsburg, Brooklyn)", 4 | "personal": "- Met June Mansfield: Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall. (1923, New York)\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)", 5 | "career": "- Published Tropic of Cancer: Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States). (1934, Paris, France)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)", 6 | "legacy": "- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)" 7 | }, 8 | "new_events": "- Met Robert W. Service in Paris: Miller met author Robert W. Service on a Paris street and discussed books, with Service recalling the encounter in his autobiography. 
(1928, Paris)\n- Financial support from Anaïs Nin: Anaïs Nin, with Hugh Guiler, financed Miller's living expenses through the 1930s, including rent for an apartment at 18 Villa Seurat, and funded the first printing of Tropic of Cancer in 1934. (1930s, Paris)\n- Learned about and promoted George Dibbern: During the late 1930s, Miller learned about German-born sailor George Dibbern, helped promote his memoir Quest, and organized charity to help him. (late 1930s, France)\n- Wrote The Smile at the Foot of the Ladder: In 1948, Miller wrote a novella he called his 'most singular story,' The Smile at the Foot of the Ladder. (1948, California)" 9 | } 10 | -------------------------------------------------------------------------------- /scripts/geocode.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Input JSON file from current directory 4 | import os 5 | import time 6 | 7 | import requests 8 | 9 | INPUT_FILE = os.path.join(os.path.dirname(__file__), "events.json") 10 | OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "events_with_coords.json") 11 | 12 | 13 | # Geocoding function using OpenStreetMap Nominatim 14 | def geocode_location(location_name: str): 15 | url = "https://nominatim.openstreetmap.org/search" 16 | params = {"q": location_name, "format": "json", "limit": 1} 17 | headers = {"User-Agent": "EventGeocoder/1.0"} 18 | 19 | try: 20 | response = requests.get(url, params=params, headers=headers) 21 | response.raise_for_status() 22 | data = response.json() 23 | if data: 24 | return float(data[0]["lat"]), float(data[0]["lon"]) 25 | except Exception as e: 26 | print(f"Error geocoding {location_name}: {e}") 27 | 28 | return None, None 29 | 30 | 31 | def main(): 32 | # Load events 33 | with open(INPUT_FILE, encoding="utf-8") as f: 34 | events = json.load(f) 35 | 36 | # Process each event 37 | for i, event in enumerate(events, 1): 38 | # Add auto-incremental ID 39 | event["id"] = i 40 | 41 | # Only 
process location if it's not empty 42 | if isinstance(event.get("location"), str) and event["location"].strip(): 43 | loc_name = event["location"] 44 | print(f"Geocoding: {loc_name} ...") 45 | lat, lng = geocode_location(loc_name) 46 | if lat and lng: 47 | event["location"] = {"name": loc_name, "lat": lat, "lng": lng} 48 | else: 49 | event["location"] = {"name": loc_name, "lat": None, "lng": None} 50 | 51 | # Be polite to the API 52 | time.sleep(1) 53 | else: 54 | # Remove location field entirely for empty locations 55 | if "location" in event: 56 | del event["location"] 57 | 58 | # Save new JSON 59 | with open(OUTPUT_FILE, "w", encoding="utf-8") as f: 60 | json.dump(events, f, indent=2, ensure_ascii=False) 61 | 62 | print(f"✅ Saved updated events to {OUTPUT_FILE}") 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /src/research_events/chunk_graph.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, TypedDict 2 | 3 | from langgraph.graph import END, START, StateGraph 4 | from langgraph.graph.state import CompiledStateGraph 5 | from pydantic import BaseModel, Field 6 | from src.configuration import Configuration 7 | from src.llm_service import create_llm_chunk_model 8 | 9 | 10 | class BiographicEventCheck(BaseModel): 11 | contains_biographic_event: bool = Field( 12 | description="Whether the text chunk contains biographical events" 13 | ) 14 | 15 | 16 | class ChunkResult(BaseModel): 17 | content: str 18 | contains_biographic_event: bool = Field( 19 | description="Whether the text chunk contains biographical events" 20 | ) 21 | 22 | 23 | class ChunkState(TypedDict): 24 | text: str 25 | chunks: List[str] 26 | results: Dict[str, ChunkResult] 27 | 28 | 29 | def split_text(state: ChunkState) -> ChunkState: 30 | """Split text into smaller chunks.""" 31 | text = state["text"] 32 | chunk_size = 2000 33 | chunks = 
[text[i : i + chunk_size] for i in range(0, len(text), chunk_size)] 34 | return {"chunks": chunks} 35 | 36 | 37 | def check_chunk_for_events(state: ChunkState, config) -> ChunkState: 38 | """Check each chunk for biographical events using structured output.""" 39 | model = create_llm_chunk_model(config, BiographicEventCheck) 40 | results = {} 41 | 42 | for i, chunk in enumerate(state["chunks"]): 43 | prompt = f""" 44 | Analyze this text chunk and determine if it contains SPECIFIC biographical events. 45 | 46 | ONLY mark as true if the chunk contains: 47 | - Birth/death dates or locations 48 | - Marriage ceremonies or relationships 49 | - Educational enrollment or graduation 50 | - Career appointments or job changes 51 | - Awards, prizes, or honors received 52 | - Relocations to new cities/countries 53 | - Major discoveries or inventions 54 | 55 | DO NOT mark as true for: 56 | - General descriptions or background information 57 | - Character traits or personality descriptions 58 | - General statements about time periods 59 | - Descriptions of places without personal connection 60 | - General knowledge or context 61 | 62 | The event must be specific and concrete, not general background. 
63 | 64 | Text chunk: "{chunk}" 65 | """ 66 | 67 | result = model.invoke(prompt) 68 | results[f"chunk_{i}"] = ChunkResult( 69 | content=chunk, contains_biographic_event=result.contains_biographic_event 70 | ) 71 | 72 | return {"results": results} 73 | 74 | 75 | def create_biographic_event_graph() -> CompiledStateGraph: 76 | """Create and return the biographic event detection graph.""" 77 | graph = StateGraph(ChunkState, config_schema=Configuration) 78 | 79 | graph.add_node("split_text", split_text) 80 | graph.add_node("check_events", check_chunk_for_events) 81 | 82 | graph.add_edge(START, "split_text") 83 | graph.add_edge("split_text", "check_events") 84 | graph.add_edge("check_events", END) 85 | 86 | return graph.compile() 87 | 88 | 89 | graph = create_biographic_event_graph() 90 | -------------------------------------------------------------------------------- /src/llm_service.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from langchain.chat_models import init_chat_model 4 | from langchain_core.runnables import Runnable, RunnableConfig 5 | from langchain_core.tools import BaseTool 6 | from pydantic import BaseModel 7 | from src.configuration import Configuration 8 | from src.utils import get_api_key_for_model 9 | 10 | configurable_model = init_chat_model( 11 | configurable_fields=("model", "max_tokens", "api_key", "reasoning") 12 | ) 13 | 14 | 15 | # This contains the shared logic. The underscore _ means other files shouldn't use it. 
16 | def _build_and_configure_model( 17 | config: RunnableConfig, 18 | model_chain: Runnable, 19 | model_name: str, 20 | max_tokens: int, 21 | max_retries: int, 22 | ) -> Runnable: 23 | """Internal helper to apply retry and runtime configuration.""" 24 | model_config = { 25 | "model": model_name, 26 | "max_tokens": max_tokens, 27 | "api_key": get_api_key_for_model(model_name, config), 28 | "reasoning": "False", 29 | } 30 | return model_chain.with_retry(stop_after_attempt=max_retries).with_config( 31 | model_config 32 | ) 33 | 34 | 35 | # --- Public Function 1: For Models WITH Tools --- 36 | def create_llm_with_tools( 37 | tools: List[Type[BaseTool]], config: RunnableConfig 38 | ) -> Runnable: 39 | """Creates a model configured specifically for tool-calling.""" 40 | configurable = Configuration.from_runnable_config(config) 41 | 42 | # Start the chain by binding the tools 43 | model_with_tools = configurable_model.bind_tools(tools) 44 | 45 | return _build_and_configure_model( 46 | config=config, 47 | model_chain=model_with_tools, 48 | model_name=configurable.get_llm_with_tools_model(), 49 | max_tokens=configurable.tools_llm_max_tokens, 50 | max_retries=configurable.max_tools_output_retries, 51 | ) 52 | 53 | 54 | # --- Public Function 2: For Models WITHOUT Tools --- 55 | def create_llm_structured_model( 56 | config: RunnableConfig, class_name: Type[BaseModel] | None = None 57 | ) -> Runnable: 58 | """Creates a general-purpose chat model with no tools.""" 59 | configurable = Configuration.from_runnable_config(config) 60 | 61 | # The chain is just the base model itself 62 | if class_name: 63 | base_model = configurable_model.with_structured_output(class_name) 64 | else: 65 | base_model = configurable_model 66 | 67 | return _build_and_configure_model( 68 | config=config, 69 | model_chain=base_model, 70 | model_name=configurable.get_llm_structured_model(), 71 | max_tokens=configurable.structured_llm_max_tokens, 72 | max_retries=configurable.max_structured_output_retries, 
73 | ) 74 | 75 | 76 | # --- Public Function 3: For Small Chunk Models --- 77 | def create_llm_chunk_model( 78 | config: RunnableConfig, class_name: Type[BaseModel] | None = None 79 | ) -> Runnable: 80 | """Creates a small model for chunk biographical event detection.""" 81 | configurable = Configuration.from_runnable_config(config) 82 | 83 | # The chain is just the base model itself 84 | if class_name: 85 | base_model = configurable_model.with_structured_output(class_name) 86 | else: 87 | base_model = configurable_model 88 | 89 | return _build_and_configure_model( 90 | config=config, 91 | model_chain=base_model, 92 | model_name=configurable.get_llm_chunk_model(), 93 | max_tokens=1024, # Smaller token limit for chunk processing 94 | max_retries=2, # Fewer retries for chunk processing 95 | ) 96 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from langchain_core.messages import ( 4 | AIMessage, 5 | BaseMessage, 6 | HumanMessage, 7 | SystemMessage, 8 | ToolMessage, 9 | ) 10 | from langchain_core.runnables import RunnableConfig 11 | from langchain_core.tools import tool 12 | 13 | 14 | @tool( 15 | description="Mandatory reflection tool. Analyze results and plan the next search query." 16 | ) 17 | def think_tool(reflection: str) -> str: 18 | """Mandatory reflection step. Use this to analyze the last result, identify gaps, and formulate the EXACT query for the next search. 19 | 20 | You MUST use this tool immediately after every ResearchEventsTool call. 21 | 22 | Analyze if an additional call to the ResearchEventsTool is needed to fill the gaps or the research is completed. When is completed, you must call the FinishResearchTool. 23 | 24 | The `reflection` argument must follow the structure defined in the system prompt, culminating in the precise search query you will use next. 
25 | 26 | Args: 27 | reflection: Structured analysis of the last result, current gaps, and the PLANNED QUERY for the next step. 28 | 29 | Returns: 30 | Confirmation and instruction to proceed to the next step. 31 | """ 32 | # The return value is crucial. It becomes the ToolMessage the LLM sees next. 33 | # By explicitly telling it what to do, we break the loop. 34 | return f"Reflection recorded. {reflection}" 35 | 36 | 37 | def get_api_key_for_model(model_name: str, config: RunnableConfig): 38 | """Get API key for a specific model from environment or config.""" 39 | model_name = model_name.lower() 40 | 41 | if model_name.startswith("openai:"): 42 | return os.getenv("OPENAI_API_KEY") 43 | elif model_name.startswith("anthropic:"): 44 | return os.getenv("ANTHROPIC_API_KEY") 45 | elif model_name.startswith("google"): 46 | print("GOOGLE_API_KEY", os.getenv("GOOGLE_API_KEY")) 47 | return os.getenv("GOOGLE_API_KEY") 48 | elif model_name.startswith("ollama:"): 49 | # Ollama doesn't need API key 50 | return None 51 | return None 52 | 53 | 54 | def get_buffer_string_with_tools(messages: list[BaseMessage]) -> str: 55 | """Return a readable transcript showing roles, including tool names for ToolMessages.""" 56 | lines = [] 57 | for m in messages: 58 | if isinstance(m, HumanMessage): 59 | lines.append(f"Human: {m.content}") 60 | elif isinstance(m, AIMessage): 61 | ai_content = f"AI: {m.content}" 62 | # Include tool calls if present 63 | if hasattr(m, "tool_calls") and m.tool_calls: 64 | tool_calls_str = ", ".join( 65 | [ 66 | f"{tc.get('name', 'unknown')}({tc.get('args', {})})" 67 | for tc in m.tool_calls 68 | ] 69 | ) 70 | ai_content += f" [Tool calls: {tool_calls_str}]" 71 | lines.append(ai_content) 72 | elif isinstance(m, SystemMessage): 73 | lines.append(f"System: {m.content}") 74 | elif isinstance(m, ToolMessage): 75 | # Include tool name if available 76 | tool_name = ( 77 | getattr(m, "name", None) or getattr(m, "tool", None) or "unknown_tool" 78 | ) 79 | 
lines.append(f"Tool[{tool_name}]: {m.content}") 80 | else: 81 | # fallback for unknown or custom message types 82 | lines.append(f"{m.__class__.__name__}: {m.content}") 83 | return "\n".join(lines) 84 | 85 | 86 | def get_langfuse_handler(): 87 | try: 88 | from langfuse.langchain import CallbackHandler 89 | 90 | return CallbackHandler() 91 | except ImportError: 92 | return None 93 | -------------------------------------------------------------------------------- /src/url_crawler/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import re 4 | from typing import List 5 | 6 | import aiohttp 7 | import tiktoken 8 | 9 | FIRECRAWL_API_URL = ( 10 | f"{os.getenv('FIRECRAWL_BASE_URL', 'https://api.firecrawl.dev')}/v0/scrape" 11 | ) 12 | 13 | 14 | async def url_crawl(url: str) -> str: 15 | """Crawls a URL and returns its content. For this example, returns dummy text.""" 16 | # print(f"--- FAKE CRAWLING: {url} ---") 17 | # if "wikipedia" in url: 18 | # return "Henry Miller was an American novelist, short story writer and essayist. He was born in Yorkville, NYC on December 26, 1891. He moved to Paris in 1930. He wrote tropic of cancer, part of his series of novels about his life." 
19 | 20 | content = await scrape_page_content(url) 21 | if content is None: 22 | return "" 23 | return remove_markdown_links(content) 24 | 25 | 26 | async def scrape_page_content(url): 27 | """Scrapes URL using Firecrawl API and returns Markdown content.""" 28 | try: 29 | headers = {"Content-Type": "application/json"} 30 | 31 | # Add API key if available 32 | api_key = os.getenv("FIRECRAWL_API_KEY") 33 | if api_key: 34 | headers["Authorization"] = f"Bearer {api_key}" 35 | 36 | async with aiohttp.ClientSession() as session: 37 | async with session.post( 38 | FIRECRAWL_API_URL, 39 | json={ 40 | "url": url, 41 | "pageOptions": {"onlyMainContent": True}, 42 | "formats": ["markdown"], 43 | }, 44 | headers=headers, 45 | timeout=aiohttp.ClientTimeout(total=30), 46 | ) as response: 47 | response.raise_for_status() 48 | data = await response.json() 49 | return data.get("data", {}).get("markdown") 50 | except Exception as e: 51 | print(f"Error scraping page content: {e}") 52 | return None 53 | 54 | 55 | def remove_markdown_links(markdown_text): 56 | """Removes Markdown links, keeping only display text.""" 57 | return re.sub(r"\[(.*?)\]\(.*?\)", r"\1", markdown_text) 58 | 59 | 60 | # Global tokenizer cache to avoid repeated loading 61 | _tokenizer = None 62 | 63 | 64 | def get_tokenizer(): 65 | """Get the tiktoken tokenizer, loading it lazily.""" 66 | global _tokenizer 67 | if _tokenizer is None: 68 | _tokenizer = tiktoken.get_encoding("cl100k_base") 69 | return _tokenizer 70 | 71 | 72 | async def chunk_text_by_tokens( 73 | text: str, chunk_size: int = 1000, overlap_size: int = 20 74 | ) -> List[str]: 75 | """Splits text into token-based, overlapping chunks.""" 76 | if not text: 77 | return [] 78 | 79 | # Load tokenizer in a thread to avoid blocking 80 | encoding = await asyncio.to_thread(get_tokenizer) 81 | tokens = encoding.encode(text) 82 | print("--- TOKENS ---") 83 | print(len(tokens)) 84 | print("--- TOKENS ---") 85 | chunks = [] 86 | start_index = 0 87 | while 
start_index < len(tokens): 88 | end_index = start_index + chunk_size 89 | chunk_tokens = tokens[start_index:end_index] 90 | chunks.append(encoding.decode(chunk_tokens)) 91 | start_index += chunk_size - overlap_size 92 | 93 | print(f""" 94 | CHUNKED TEXT --- 95 | Chunks:{len(chunks)} 96 | Tokens: {len(tokens)} 97 | Text: {len(text)} 98 | --- 99 | """) 100 | return chunks 101 | 102 | 103 | async def count_tokens(messages: List[str]) -> int: 104 | """Counts the total tokens in a list of messages.""" 105 | # Load tokenizer in a thread to avoid blocking 106 | encoding = await asyncio.to_thread(get_tokenizer) 107 | return sum(len(encoding.encode(msg)) for msg in messages) 108 | -------------------------------------------------------------------------------- /src/research_events/merge_events/prompts.py: -------------------------------------------------------------------------------- 1 | categorize_events_prompt = """ 2 | You are a helpful assistant that will categorize the events into the 4 categories. 3 | 4 | 5 | {events} 6 | 7 | 8 | 9 | early: Covers childhood, upbringing, family, education, and early influences that shaped the author. 10 | personal: Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs. 11 | career: Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones. 12 | legacy: Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and they are remembered today. 13 | 14 | 15 | 16 | 17 | INCLUDE ALL THE INFORMATION FROM THE EVENTS, do not abbreviate or omit any information. 18 | 19 | """ 20 | 21 | EXTRACT_AND_CATEGORIZE_PROMPT = """ 22 | You are a Biographical Event Extractor and Categorizer. 
Your task is to analyze text chunks for events related to the life of the historical figure**

- `IrrelevantChunk` (use if the text contains NO biographical events relevant to the research question)
- `RelevantEventsCategorized` (use if the text contains relevant events - categorize them into the 4 categories)


early: Covers childhood, upbringing, family, education, and early influences that shaped the author.
personal: Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs.
career: Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones.
legacy: Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and how they are remembered today.


**EXTRACTION RULES**:
- Extract COMPLETE sentences with ALL available details (dates, names, locations, context, emotions, motivations)
- Include surrounding context that makes the event meaningful and complete
- Preserve the original narrative flow and descriptive language
- Capture cause-and-effect relationships and consequences
- Include only events directly relevant to the research question
- Maintain chronological order within each category
- Format as clean bullet points with complete, detailed descriptions (e.g., "- In the spring of 1965, while living in a small apartment in Paris, she attended a poetry reading that fundamentally changed her approach to writing, inspiring her to experiment with free verse.")
- IMPORTANT: Return each category as a SINGLE string containing all bullet points, not as a list


{text_chunk}


You must call exactly one of the provided tools. Do not respond with plain text.
"""


# Prompt: merge newly extracted events into an existing bullet list.
# Contract: originals are preserved verbatim; new events are appended only
# when they add non-duplicate information.
MERGE_EVENTS_TEMPLATE = """You are a helpful assistant that will merge two lists of events:
the original events (which must always remain) and new events (which may contain extra details).
The new events should only be treated as additions if they provide relevant new information.
The final output must preserve the original events and seamlessly add the new ones if applicable.


- Always include the original events exactly, do not omit or alter them.
- Add new events only if they are not duplicates, combining details if they overlap.
- Format the final list as bullet points, one event per line (e.g., "- Event details.").
- Keep the list clean, concise, and without commentary.


Original events:
{original}

New events:
{new}


Return only the merged list of events as bullet points, nothing else.
"""
--------------------------------------------------------------------------------
/src/configuration.py:
--------------------------------------------------------------------------------
import os
from typing import Any

from langchain_core.runnables import RunnableConfig
from pydantic import BaseModel, Field


class Configuration(BaseModel):
    """Main configuration class for the Deep Research agent."""

    # Single model for most providers (simplified configuration)
    llm_model: str = Field(
        default="google_genai:gemini-2.5-flash",
        description="Primary LLM model to use for both structured output and tools",
    )

    # Optional overrides
    structured_llm_model: str | None = Field(
        default=None,
        # default="ollama:mistral-nemo:latest",
        description="Override model for structured output",
    )
    tools_llm_model: str | None = Field(
        default=None,
        # default="ollama:gpt-oss:20b",
        description="Override model for tools",
    )
    chunk_llm_model: str |
None = Field( 29 | default=None, 30 | # default="ollama:gemma3:4b", 31 | description="Small model for chunk biographical event detection", 32 | ) 33 | 34 | structured_llm_max_tokens: int = Field( 35 | default=4096, description="Maximum tokens for structured output model" 36 | ) 37 | tools_llm_max_tokens: int = Field( 38 | default=4096, description="Maximum tokens for tools model" 39 | ) 40 | 41 | max_structured_output_retries: int = Field( 42 | default=3, description="Maximum retry attempts for structured output" 43 | ) 44 | max_tools_output_retries: int = Field( 45 | default=3, description="Maximum retry attempts for tool calls" 46 | ) 47 | 48 | # Hardcoded values from graph files 49 | default_chunk_size: int = Field( 50 | default=800, description="Default chunk size for text processing" 51 | ) 52 | default_overlap_size: int = Field( 53 | default=20, description="Default overlap size between chunks" 54 | ) 55 | max_content_length: int = Field( 56 | default=100000, description="Maximum content length to process" 57 | ) 58 | max_tool_iterations: int = Field( 59 | default=5, description="Maximum number of tool iterations" 60 | ) 61 | max_chunks: int = Field( 62 | default=20, 63 | description="Maximum number of chunks to process for biographical event detection", 64 | ) 65 | 66 | def get_llm_structured_model(self) -> str: 67 | """Get the LLM structured model, using overrides if provided.""" 68 | print(f"Getting LLM structured model: {self.structured_llm_model}") 69 | if self.structured_llm_model: 70 | return self.structured_llm_model 71 | 72 | return self.llm_model 73 | 74 | def get_llm_with_tools_model(self) -> str: 75 | """Get the LLM with tools model, using overrides if provided.""" 76 | print(f"Getting LLM with tools model: {self.tools_llm_model}") 77 | if self.tools_llm_model: 78 | return self.tools_llm_model 79 | 80 | return self.llm_model 81 | 82 | def get_llm_chunk_model(self) -> str: 83 | """Get the LLM chunk model, using overrides if provided.""" 84 | if 
self.chunk_llm_model: 85 | return self.chunk_llm_model 86 | return "ollama:gemma3:4b" 87 | 88 | @classmethod 89 | def from_runnable_config( 90 | cls, config: RunnableConfig | None = None 91 | ) -> "Configuration": 92 | """Create a Configuration instance from a RunnableConfig.""" 93 | configurable = config.get("configurable", {}) if config else {} 94 | field_names = list(cls.model_fields.keys()) 95 | values: dict[str, Any] = { 96 | field_name: os.environ.get(field_name.upper(), configurable.get(field_name)) 97 | for field_name in field_names 98 | } 99 | return cls(**{k: v for k, v in values.items() if v is not None}) 100 | -------------------------------------------------------------------------------- /src/test/test_enhanced_merge_events.py: -------------------------------------------------------------------------------- 1 | """Tests for the enhanced merge events graph.""" 2 | 3 | from unittest.mock import AsyncMock, Mock, patch 4 | 5 | import pytest 6 | from src.state import CategoriesWithEvents 7 | from src.research_events.merge_events.merge_events_graph import merge_events_app 8 | 9 | 10 | @pytest.fixture 11 | def sample_merge_input_state() -> dict: 12 | """Provide a sample input state for the enhanced merge events graph.""" 13 | return { 14 | "existing_events": CategoriesWithEvents( 15 | early="Born in 1920 in Paris.", 16 | personal="Married in 1945.", 17 | career="Published first novel in 1950.", 18 | legacy="Won Nobel Prize in 1980.", 19 | ), 20 | "extracted_events": "He was born in New York City in 1920 and started writing at age 15. 
He published his first book in 1950 and won the Pulitzer Prize in 1985.",
        "research_question": "Research the life of the author",
    }


# Minimal stand-ins for LangChain tool-call structures; subscriptable so the
# graph can read tc["name"] / tc["args"] like a real tool call dict.
class MockToolCall:
    """Mock tool call for structured LLM responses."""

    def __init__(self, name, args):
        """Initialize mock tool call with name and args."""
        self.name = name
        self.args = args

    def __getitem__(self, key):
        """Make the mock tool call subscriptable."""
        if key == "name":
            return self.name
        elif key == "args":
            return self.args
        else:
            raise KeyError(f"Key {key} not found in MockToolCall")


class MockToolResponse:
    """Mock tool response for structured LLM responses."""

    def __init__(self, tool_calls=None):
        """Initialize mock tool response with tool calls."""
        self.tool_calls = tool_calls or []


@pytest.mark.asyncio
async def test_enhanced_merge_events_with_mocked_llm(sample_merge_input_state: dict):
    """Unit test for the enhanced merge events graph with mocked dependencies."""
    # --- Act: Execute the graph with patched dependencies ---
    # NOTE(review): patches "create_tools_model"; confirm this matches the
    # factory name actually used in merge_events_graph.
    with patch(
        "src.research_events.merge_events.merge_events_graph.create_tools_model"
    ) as mock_tools_model:

        # Mock the tools model response for categorization
        mock_tools_response = MockToolResponse([
            MockToolCall("RelevantEventsCategorized", {
                "early": "- Born in New York City in 1920",
                "personal": "",
                "career": "- Published first book in 1950",
                "legacy": "- Won Pulitzer Prize in 1985",
            })
        ])
        mock_tools_instance = AsyncMock()
        mock_tools_instance.ainvoke.return_value = mock_tools_response
        mock_tools_model.return_value = mock_tools_instance

        result = await merge_events_app.ainvoke(sample_merge_input_state)

    # --- Assert: Verify the output ---
    assert "existing_events" in result
    existing_events = result["existing_events"]
    assert isinstance(existing_events, CategoriesWithEvents)

    # Verify that the tools model was called for categorization
    mock_tools_instance.ainvoke.assert_called()


@pytest.mark.asyncio
async def test_enhanced_merge_events_with_empty_content():
    """Test enhanced merge events with empty extracted content."""
    input_state = {
        "existing_events": CategoriesWithEvents(
            early="Born in 1920.",
            personal="Married in 1945.",
            career="Published in 1950.",
            legacy="Won prize in 1980.",
        ),
        "extracted_events": "",  # Empty content
        "research_question": "Test question",
    }

    result = await merge_events_app.ainvoke(input_state)

    # Should return existing events unchanged
    assert "existing_events" in result
    existing_events = result["existing_events"]
    assert existing_events.early == "Born in 1920."
    assert existing_events.personal == "Married in 1945."
    assert existing_events.career == "Published in 1950."
    assert existing_events.legacy == "Won prize in 1980."
--------------------------------------------------------------------------------
/src/prompts.py:
--------------------------------------------------------------------------------
# System prompt for the lead research agent: enforces a strict one-tool-per-turn
# loop of research -> reflection until completion.
lead_researcher_prompt = """
You are a meticulous research agent. Your primary directive is to follow a strict, state-based execution cycle to build a comprehensive event timeline for: **{person_to_research}**.

**
On every turn, you MUST follow these steps in order:

1. **Step 1: Check for Completion.**
   * Examine the ``. If it explicitly states the research is COMPLETE, you MUST immediately call the `FinishResearchTool` and stop.


**CRITICAL CONSTRAINTS:**
* NEVER call `ResearchEventsTool` twice in a row.
* NEVER call `think_tool` twice in a row.
* ALWAYS call exactly ONE tool per turn.


{events_summary}



{last_message}





****
* `ResearchEventsTool`: Finds events about the historical figure.
* `FinishResearchTool`: Ends the research process. Call this ONLY when the research is complete
* `think_tool`: Use this to analyze results and plan the EXACT search query for your next action.

**CRITICAL: Use think_tool before calling ResearchEventsTool to plan your approach, and after each ResearchEventsTool to assess progress. Do not call think_tool two times in a row.**


1. **Top Priority Gap:** Identify the SINGLE most important missing piece of information from the ``.
2 **Planned Query:** Write the EXACT search query you will use in the next `ResearchEventsTool` call to fill that gap.

**CRITICAL:** Execute ONLY ONE tool call now, following the ``.
"""


# NOTE(review): several sections above/below reference `` and contain bare
# placeholders — XML-style tags appear to have been stripped from this copy;
# confirm the prompt text against version control.

# Prompt: append one numbered log entry summarizing the newest messages onto a
# running conversation summary.
create_messages_summary_prompt = """You are a specialized assistant that maintains a summary of the conversation between the user and the assistant.


1. AI Call: Order to call the ResearchEventsTool, the assistant asked the user for the research question.
2. Tool Call: The assistant called the ResearchEventsTool with the research question.
3. AI Call: Order to call think_tool to analyze the results and plan the next action.
4. Tool Call: The assistant called the think_tool.
...



{previous_messages_summary}



{new_messages}



Return just the new log entry with it's corresponding number and content.
Do not include Ids of tool calls



X.


Output:
"""


# Prompt: summarize the two biggest information gaps in the collected events.
events_summarizer_prompt = """
Analyze the following events and identify only the 2 biggest gaps in information. Be brief and general.

**Events:**
{existing_events}




**Gaps:**
"""


# Prompt: convert the final ordered event list into structured JSON
# (consumed with structured output against the Chronology models).
structure_events_prompt = """You are a data processing specialist. Your sole task is to convert a pre-cleaned, chronologically ordered list of life events into a structured JSON object.


You will be given a list of events that is already de-duplicated and ordered. You must not change the order or content of the events. For each event in the list, you will extract its name, a detailed description, its date, and location, and format it as JSON.



1. For the `name` field, create a short, descriptive title for the event (e.g., "Birth of Pablo Picasso").
2. For the `description` field, provide the clear and concise summary of what happened from the input text.
3. For the `date` field, populate `year`, `month`, and `day` whenever possible.
4. If the date is an estimate or a range (e.g., "circa 1912" or "Between 1920-1924"), you MUST capture that specific text in the `note` field of the date object, and provide your best estimate for the `year`.
5. For the `location` field, populate the location of the event, leave blank if not mentioned


----
{existing_events}
----


CRITICAL: You must only return the structured JSON output. Do not add any commentary, greetings, or explanations before or after the JSON.
"""
--------------------------------------------------------------------------------
/src/test/test_url_crawler.py:
--------------------------------------------------------------------------------
# tests/url_crawler/test_url_crawler.py

"""Tests for the url_krawler_graph."""

from unittest.mock import AsyncMock, patch

import pytest

# Imports are relative to the src directory (configured in pyproject.toml pythonpath)
from url_crawler.url_krawler_graph import url_crawler_app


@pytest.fixture
def sample_input_state() -> dict:
    """Provide a sample input state for the url_crawler_app graph."""
    return {
        "url": "https://www.britannica.com/biography/Henry-Miller",
        "research_question": "Research the life of Henry Miller",
    }


@pytest.fixture
def mock_scraped_content():
    """Provide mock scraped content for testing."""
    return """
    Henry Miller was an American novelist, short story writer and essayist.
    He was born in Yorkville, NYC on December 26, 1891.
    He moved to Paris in 1930 where he lived for many years.
    He wrote Tropic of Cancer, part of his series of novels about his life.
    He married his first wife Beatrice in 1917.
    He had a daughter named Barbara in 1919.
    He divorced Beatrice in 1924.
    He married his second wife June in 1924.
    He died in Pacific Palisades, California on June 7, 1980.
35 | """ 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_url_crawler_with_mocked_llm( 40 | sample_input_state: dict, 41 | mock_scraped_content: str, 42 | ): 43 | """Unit test for the simplified URL crawler graph.""" 44 | # --- Act: Execute the graph with patched dependencies --- 45 | with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl: 46 | # Configure URL crawling mock 47 | mock_crawl.return_value = mock_scraped_content 48 | 49 | result = await url_crawler_app.ainvoke(sample_input_state) 50 | 51 | # --- Assert: Verify the output --- 52 | assert "extracted_events" in result 53 | assert "raw_scraped_content" in result 54 | 55 | extracted_events = result["extracted_events"] 56 | raw_scraped_content = result["raw_scraped_content"] 57 | 58 | # Verify that the scraped content is returned 59 | assert extracted_events == mock_scraped_content 60 | assert raw_scraped_content == mock_scraped_content 61 | 62 | # Verify that url_crawl was called with the correct URL 63 | mock_crawl.assert_called_once_with(sample_input_state["url"]) 64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_url_crawler_with_mocked_url_crawling( 68 | sample_input_state: dict, 69 | mock_scraped_content: str, 70 | ): 71 | """Test URL crawler with mocked URL crawling.""" 72 | # --- Act: Execute with mocked URL crawling --- 73 | with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl: 74 | # Configure URL crawling mock 75 | mock_crawl.return_value = mock_scraped_content 76 | 77 | result = await url_crawler_app.ainvoke(sample_input_state) 78 | 79 | # --- Assert: Verify the output --- 80 | assert "extracted_events" in result 81 | assert "raw_scraped_content" in result 82 | 83 | extracted_events = result["extracted_events"] 84 | raw_scraped_content = result["raw_scraped_content"] 85 | 86 | # Verify that the scraped content is returned correctly 87 | assert extracted_events == mock_scraped_content 88 | assert raw_scraped_content == mock_scraped_content 89 | 90 | # Verify 
that url_crawl was called with the correct URL 91 | mock_crawl.assert_called_once_with(sample_input_state["url"]) 92 | 93 | 94 | @pytest.mark.asyncio 95 | async def test_url_crawler_with_empty_content(): 96 | """Test URL crawler with empty scraped content.""" 97 | input_state = { 98 | "url": "https://example.com/empty", 99 | "research_question": "Test question", 100 | } 101 | 102 | with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl: 103 | # Configure URL crawling mock to return empty content 104 | mock_crawl.return_value = "" 105 | 106 | result = await url_crawler_app.ainvoke(input_state) 107 | 108 | # --- Assert: Verify the output --- 109 | assert "extracted_events" in result 110 | assert "raw_scraped_content" in result 111 | 112 | # Should return empty content 113 | assert result["extracted_events"] == "" 114 | assert result["raw_scraped_content"] == "" 115 | 116 | 117 | @pytest.mark.asyncio 118 | async def test_url_crawler_with_long_content(): 119 | """Test URL crawler with content longer than MAX_CONTENT_LENGTH.""" 120 | input_state = { 121 | "url": "https://example.com/long", 122 | "research_question": "Test question", 123 | } 124 | 125 | # Create content longer than typical MAX_CONTENT_LENGTH 126 | long_content = "This is a very long content. 
" * 10000 # Much longer than limit 127 | 128 | with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl: 129 | # Configure URL crawling mock to return long content 130 | mock_crawl.return_value = long_content 131 | 132 | result = await url_crawler_app.ainvoke(input_state) 133 | 134 | # --- Assert: Verify the output --- 135 | assert "extracted_events" in result 136 | assert "raw_scraped_content" in result 137 | 138 | # Content should be truncated to MAX_CONTENT_LENGTH 139 | returned_content = result["extracted_events"] 140 | assert len(returned_content) <= len(long_content) 141 | assert returned_content == result["raw_scraped_content"] -------------------------------------------------------------------------------- /src/test/test_merge_events.py: -------------------------------------------------------------------------------- 1 | # tests/research_events/test_merge_events.py 2 | 3 | """Tests for the merge_events_graph.""" 4 | 5 | from unittest.mock import AsyncMock, patch 6 | 7 | import pytest 8 | from src.state import CategoriesWithEvents 9 | 10 | # Imports are relative to the src directory (configured in pyproject.toml pythonpath) 11 | from research_events.merge_events import merge_events_graph 12 | from research_events.merge_events.merge_events_graph import merge_events_app 13 | 14 | # ## Refactoring Note: Import the module containing the object to be patched. 15 | # This makes the patch call cleaner and more robust against refactoring. 16 | 17 | 18 | @pytest.fixture 19 | def sample_input_state() -> dict: 20 | """Provide a sample input state for the merge_events_app graph.""" 21 | return { 22 | "existing_events": CategoriesWithEvents( 23 | early="Born in 1920 in Paris.", 24 | personal="Married in 1945.", 25 | career="Published first novel in 1950.", 26 | legacy="Won Nobel Prize in 1980.", 27 | ), 28 | "raw_extracted_events": "Born in 1920 in Paris, France. Started writing poetry at age 15. 
Moved to London in 1942.",
    }


class MockResponse:
    """Mock response class for LLM responses."""

    def __init__(self, content):
        """Initialize mock response with content."""
        self.content = content


@pytest.fixture
def mock_structured_llm():
    """Provide a reusable mock for model_for_structured with configurable responses."""

    def create_mock_model(categorized_events, merge_responses):
        """Create a configured mock model."""
        mock_model = AsyncMock()

        # Mock the structured output chain
        mock_structured_llm = AsyncMock()
        mock_structured_llm.ainvoke.return_value = categorized_events
        mock_model.with_structured_output.return_value = mock_structured_llm

        # Create async functions that return the mock responses
        # (pops one queued response per merge call, in order)
        async def mock_ainvoke(prompt):
            return merge_responses.pop(0)

        # Mock the regular invoke calls for merging
        mock_model.ainvoke = mock_ainvoke

        return mock_model

    return create_mock_model


@pytest.mark.skip(reason="Skip mocked LLM test for now")
@pytest.mark.asyncio
async def test_merge_events_with_mocked_llm(
    sample_input_state: dict, mock_structured_llm
):
    """Unit test for the merge events graph with a mocked LLM."""
    # --- Arrange: Mock Data Setup ---
    mock_categorized_events = CategoriesWithEvents(
        early="Born in 1920 in Paris, France. Started writing poetry at age 15.",
        personal="Moved to London in 1942.",
        career="",
        legacy="",
    )

    # One queued merge response per category, consumed in order.
    mock_merge_responses = [
        MockResponse(
            "Born in 1920 in Paris, France. Started writing poetry at age 15."
        ),
        MockResponse("Married in 1945. Moved to London in 1942."),
        MockResponse("Published first novel in 1950."),
        MockResponse("Won Nobel Prize in 1980."),
    ]

    # --- Act: Execute the graph with patched dependencies ---
    with patch.object(merge_events_graph, "model_for_structured") as mock_model:
        # Configure the mock using the reusable fixture
        llm_mock = mock_structured_llm(mock_categorized_events, mock_merge_responses)

        # Apply the mock configuration
        mock_model.ainvoke = llm_mock.ainvoke
        mock_model.with_structured_output.return_value = (
            llm_mock.with_structured_output.return_value
        )

        result = await merge_events_app.ainvoke(sample_input_state)

    # --- Assert: Verify the output ---
    assert "existing_events" in result
    merged_events = result["existing_events"]

    assert isinstance(merged_events, CategoriesWithEvents)
    assert (
        merged_events.early
        == "Born in 1920 in Paris, France. Started writing poetry at age 15."
    )
    assert merged_events.personal == "Married in 1945. Moved to London in 1942."
    assert merged_events.career == "Published first novel in 1950."
    assert merged_events.legacy == "Won Nobel Prize in 1980."


@pytest.mark.skip(reason="Skip real LLM test for now")
@pytest.mark.llm
@pytest.mark.asyncio
async def test_merge_events_with_real_llm(sample_input_state: dict):
    """Integration test for the merge events graph with real LLM calls."""
    # --- Act ---
    result = await merge_events_app.ainvoke(sample_input_state)

    # --- Assert ---
    assert "existing_events" in result
    merged = result["existing_events"]
    assert isinstance(merged, CategoriesWithEvents)

    # NOTE(review): all_merged_text is computed but never used in any assertion.
    all_merged_text = " ".join(vars(merged).values())
    # Check that key old and new info is present somewhere
    print("merged", merged)
    print(f"merged.early: {merged.early}")
    print(f"merged.personal: {merged.personal}")
    print(f"merged.career: {merged.career}")
    print(f"merged.legacy: {merged.legacy}")
    assert "1920" in merged.early
    assert "Married" in merged.personal
    assert "Nobel Prize" in merged.legacy
    assert "London" in merged.personal
--------------------------------------------------------------------------------
/src/state.py:
--------------------------------------------------------------------------------
"""Defines the Pydantic models and TypedDicts for the research agent graph.
This file serves as the schema for data structures, agent tools, and state management.
"""

import operator
from typing import Annotated, List, TypedDict

from langchain_core.messages import MessageLikeRepresentation
from pydantic import BaseModel, Field

################################################################################
# Section 1: Core Data Models
# - Defines the structure of the primary research output: the chronological timeline.
14 | ################################################################################ 15 | 16 | 17 | class ChronologyDate(BaseModel): 18 | """A structured representation of a date for a chronological event.""" 19 | 20 | year: int | None = Field(None, description="The year of the event.") 21 | note: str | None = Field( 22 | None, description="Adds extra information to the date (month, day, range...)." 23 | ) 24 | 25 | 26 | class ChronologyEventInput(BaseModel): 27 | """Represents a single event, typically used for initial data extraction before an ID is assigned.""" 28 | 29 | name: str = Field(description="A short, title-like name for the event.") 30 | description: str = Field(description="A concise description of the event.") 31 | date: ChronologyDate = Field(..., description="The structured date of the event.") 32 | location: str | None = Field( 33 | None, description="The geographical location where the event occurred." 34 | ) 35 | 36 | 37 | class ChronologyEvent(ChronologyEventInput): 38 | """The final, canonical event model with a unique identifier.""" 39 | 40 | id: str = Field( 41 | description="The id of the event in lowercase and underscores. 
Ex: 'word1_word2'" 42 | ) 43 | 44 | 45 | class ChronologyInput(BaseModel): 46 | """A list of newly extracted events from a research source.""" 47 | 48 | events: list[ChronologyEventInput] 49 | 50 | 51 | class Chronology(BaseModel): 52 | """A complete chronological timeline with finalized (ID'd) events.""" 53 | 54 | events: list[ChronologyEvent] 55 | 56 | 57 | class CategoriesWithEvents(BaseModel): 58 | early: str = Field( 59 | default="", 60 | description="Covers childhood, upbringing, family, education, and early influences that shaped the author.", 61 | ) 62 | personal: str = Field( 63 | default="", 64 | description="Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs.", 65 | ) 66 | career: str = Field( 67 | default="", 68 | description="Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones.", 69 | ) 70 | legacy: str = Field( 71 | default="", 72 | description="Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and how they are remembered today.", 73 | ) 74 | 75 | 76 | ################################################################################ 77 | # Section 2: Agent Tools 78 | # - Pydantic models that define the tools available to the LLM agents. 79 | ################################################################################ 80 | 81 | 82 | class ResearchEventsTool(BaseModel): 83 | """The query to be used to research events about an historical figure. The query is based on the reflection of the assistant.""" 84 | 85 | research_question: str 86 | pass # No arguments needed 87 | 88 | 89 | class FinishResearchTool(BaseModel): 90 | """Concludes the research process. 
91 | Call this tool ONLY when you have a comprehensive timeline of the person's life, 92 | including key events like birth, death, major achievements, and significant personal 93 | milestones, and you are confident that no major gaps remain. 94 | """ 95 | 96 | pass 97 | 98 | 99 | ################################################################################ 100 | # Section 3: Graph State Definitions 101 | # - TypedDicts and models that define the "memory" for the agent graphs. 102 | ################################################################################ 103 | 104 | 105 | def override_reducer(current_value, new_value): 106 | """Reducer function that allows a new value to completely replace the old one.""" 107 | if isinstance(new_value, dict) and new_value.get("type") == "override": 108 | return new_value.get("value", new_value) 109 | return operator.add(current_value, new_value) 110 | 111 | 112 | # --- Main Supervisor Graph State --- 113 | 114 | 115 | class SupervisorStateInput(TypedDict): 116 | """The initial input to start the main research graph.""" 117 | 118 | person_to_research: str 119 | existing_events: CategoriesWithEvents = Field( 120 | default=CategoriesWithEvents(early="", personal="", career="", legacy=""), 121 | description="Covers chronology events of the person to research.", 122 | ) 123 | used_domains: list[str] = Field( 124 | default=[], 125 | description="The domains that have been used to extract events.", 126 | ) 127 | events_summary: str = Field( 128 | default="", 129 | description="A summary of the events.", 130 | ) 131 | 132 | 133 | class SupervisorState(SupervisorStateInput): 134 | """The complete state for the main supervisor graph.""" 135 | 136 | final_events: List[ChronologyEvent] 137 | conversation_history: Annotated[list[MessageLikeRepresentation], override_reducer] 138 | iteration_count: int = 0 139 | structured_events: list[ChronologyEvent] | None 140 | 
-------------------------------------------------------------------------------- /src/research_events/merge_events/fullcategorized.json: -------------------------------------------------------------------------------- 1 | { 2 | "early": "- Henry Miller was born: Henry Valentine Miller was born on December 26 in New York City. (1891, New York City)\n- Family moved to Brooklyn: Miller's family moved to 1063 Decatur Street in Brooklyn's Bushwick neighborhood. (1900, Brooklyn, New York City)\n- Henry Miller was active with the Socialist Party of America: Miller was active with the Socialist Party of America. (1900 circa, New York)\n- Attended Eastern District High School: Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school. (early 1900s, Williamsburg, Brooklyn)\n- Attended City College of New York: Miller attended the City College of New York for one semester. (early 1910s, New York City)\n- Henry Miller married Beatrice Sylvas Wickens: Miller married Beatrice Sylvas Wickens, an amateur pianist. (1917, New York)\n- Daughter Barbara born: Miller and Beatrice had a daughter named Barbara. (1919, New York)\n- Worked at Western Union: Miller worked at Western Union as personnel manager in the messenger department. (1920-1924, New York City)\n- Wrote first novel Clipped Wings: Miller wrote his first novel, 'Clipped Wings,' during a three-week vacation in March (unpublished, only fragments remain). (1922 March, New York City)\n- Divorced Beatrice Sylvas Wickens: Miller was divorced from Beatrice Sylvas Wickens. (1923 December 21, New York)\n- Met June Mansfield: Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall. (1923, New York)\n- Married June Mansfield: Miller married June Mansfield. (1924 June 1, New York)\n- Quit Western Union to write: Miller quit his job at Western Union to dedicate himself completely to writing. 
(1924, New York)", 3 | "personal": "- Spent months in Paris with June: Miller spent several months in Paris with June, trip financed by Roland Freedman. (1928, Paris, France)\n- Moved to Paris alone: Miller moved to Paris unaccompanied. (1930, Paris, France)\n- Divorced by June in Mexico: June divorced Miller by proxy. (1934, Mexico City, Mexico)\n- Visited Greece: Miller visited Greece, invited by Lawrence Durrell who was living in Corfu. (1939, Greece)\n- Returned to New York: Miller returned to New York. (1940, New York City)\n- Moved to California: Miller moved to California in June, initially residing just outside Hollywood in Beverly Glen. (1942 June, California)\n- Settled in Big Sur: Miller settled in Big Sur. (1944, Big Sur, California)\n- Married Janina Martha Lepska: Miller married Janina Martha Lepska. (1944, United States)\n- Lived in Big Sur with bohemian writers: Miller continued living in Big Sur with other bohemian writers. (from 1947, Big Sur, California)\n- Divorced Janina Martha Lepska: Miller was divorced from Janina Martha Lepska. (1952, United States)\n- Married artist Eve McClure: Miller married artist Eve McClure. (1953, United States)\n- Divorced Eve McClure: Miller was divorced from Eve McClure. (1960, United States)\n- Reunion with June in New York: Miller arranged a reunion with ex-wife June in New York. (1961, New York)\n- Moved to Pacific Palisades: Miller moved to 444 Ocampo Drive, Pacific Palisades, Los Angeles. (1963, Pacific Palisades, Los Angeles, California)\n- Married Hiroko Tokuda: Miller married Hiroko Tokuda. (1967, United States)\n- Divorced Hiroko Tokuda: Miller was divorced from Hiroko Tokuda. (1977, United States)\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)", 4 | "career": "- Wrote Moloch: Miller wrote 'Moloch: or, This Gentile World,' initially under the guise of a novel by June Mansfield (unpublished until 1992). 
(1927-1928, New York)\n- Proofreader for Chicago Tribune Paris: Miller was employed by the Chicago Tribune Paris edition as a proofreader. (1931, Paris, France)\n- Published Tropic of Cancer: Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States). (1934, Paris, France)\n- Published Black Spring: Miller published 'Black Spring' (banned in the United States). (1936, Paris, France)\n- Published Tropic of Capricorn: Miller published 'Tropic of Capricorn' (banned in the United States). (1939, Paris, France)\n- Published The Colossus of Maroussi: Miller described his visit to Greece in 'The Colossus of Maroussi'. (1941, United States)\n- Began writing Sexus: Miller began writing 'Sexus,' the first novel in 'The Rosy Crucifixion' trilogy. (1942, California)\n- Published Sunday After the War: Miller published 'Sunday After the War'. (1944, United States)\n- Published The Air-Conditioned Nightmare: Miller published 'The Air-Conditioned Nightmare'. (1945, United States)\n- Published Big Sur and the Oranges of Hieronymus Bosch: Miller published 'Big Sur and the Oranges of Hieronymus Bosch'. (1957, United States)\n- Completed The Rosy Crucifixion trilogy: Miller completed 'The Rosy Crucifixion' trilogy (initially banned in the U.S., published in France and Japan). (1959, United States)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)\n- Published On Turning Eighty: Miller published 'On Turning Eighty,' a chapbook with 200 copies. (1972, United States)\n- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )", 5 | "legacy": "- Published The Colossus of Maroussi: Miller described his visit to Greece in 'The Colossus of Maroussi'. 
(1941, United States)\n- Published Big Sur and the Oranges of Hieronymus Bosch: Miller published 'Big Sur and the Oranges of Hieronymus Bosch'. (1957, United States)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)\n- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)" 6 | } 7 | -------------------------------------------------------------------------------- /src/research_events/result.json: -------------------------------------------------------------------------------- 1 | { 2 | "existing_events": { 3 | "early": "- Henry Valentine Miller was born in New York City on December 26, 1891.\n- He lived at 450 East 85th Street in Manhattan during his early years.\n- His family moved to Williamsburg, Brooklyn when he was around nine years old, and later to Bushwick.\n- Miller attended Eastern District High School in Williamsburg.\n- He briefly studied at the City College of New York.\n- Miller became active with the Socialist Party of America.\n- He admired Hubert Harrison.", 4 | "personal": "- Miller married Beatrice Sylvas Wickens in 1917 and divorced her in 1923. They had a daughter, Barbara.\n- Miller met June Mansfield around 1924 and they married on June 1, 1924.\n- Miller lived with Kronski at some point between 1926-1927.\n- Miller moved to Paris in 1930. He spent several months there with June in 1938. During his ten-year stay in Paris, Miller became fluent in French.\n- Miller returned to New York in 1940 and moved to California in 1942, initially residing just outside Hollywood in Beverly Glen before settling in Big Sur in 1944.\n- Miller married Janina Martha Lepska in 1944 and had two children with her. 
They divorced in 1952.\n- Miller married Eve McClure in 1953 but they divorced in 1960.\n- Miller married Hiroko Tokuda in 1967 but they divorced in 1977.", 5 | "career": "- Miller quit Western Union to dedicate himself to writing in 1924.\n- He was supported financially by Roland Freedman who paid June Mansfield to write a novel, pretending it was her work and reviewing Miller's writing weekly.\n- Miller moved to Paris unaccompanied in 1930.\n- He was employed as a proofreader for the Chicago Tribune Paris edition in 1931 thanks to Alfred Perlès.\n- This period marked a creative time for Miller, and he began building a network of authors around Villa Seurat.\n- Lawrence Durrell became a lifelong friend.\n- Anaïs Nin and Hugh Guiler financially supported Miller between 1931-1934, covering his living expenses including rent at 18 Villa Seurat.\n- Nin became his lover and financed the first printing of Tropic of Cancer in 1934 with money from Otto Rank.", 6 | "legacy": "- Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip in 1973.\n- Miller participated in the filming of Reds in the late 1970s.\n- Miller held an ongoing correspondence of over 1,500 letters with Brenda Venus between 1978 and 1981.\n- Miller died on June 7, 1980 at his home in Pacific Palisades, Los Angeles, aged 88.\n- The Henry Miller Memorial Library was founded in Big Sur in 1981 by Emil White." 
7 | }, 8 | "final_events": { 9 | "early": "- Henry Valentine Miller was born in New York City on December 26, 1891.\n- He lived at 450 East 85th Street in Manhattan during his early years.\n- His family moved to Williamsburg, Brooklyn when he was around nine years old, and later to Bushwick.\n- Miller attended Eastern District High School in Williamsburg.\n- He briefly studied at the City College of New York.\n- Miller became active with the Socialist Party of America.\n- He admired Hubert Harrison.\n- He was brought up in Brooklyn.", 10 | "personal": "- Miller married Beatrice Sylvas Wickens in 1917 and divorced her in 1923. They had a daughter, Barbara.\n- Miller met June Mansfield around 1924 and they married on June 1, 1924.\n- Miller left his job with Western Union in New York to devote himself to writing in 1924.\n- Miller lived with Kronski at some point between 1926-1927.\n- In 1930, Miller moved to Paris and visited Greece in 1939. During his ten-year stay in France, he became fluent in French.\n- Miller toured the United States extensively between 1940-41 before settling in Big Sur, California.\n- Miller returned to New York in 1940 and moved to California in 1942, initially residing just outside Hollywood in Beverly Glen before settling in Big Sur in 1944.\n- Miller married Janina Martha Lepska in 1944 and had two children with her. 
They divorced in 1952.\n- Miller married Eve McClure in 1953 but they divorced in 1960.\n- Miller married Hiroko Tokuda in 1967 but they divorced in 1977.", 11 | "career": "- Miller quit Western Union to dedicate himself to writing in 1924.\n- He was supported financially by Roland Freedman who paid June Mansfield to write a novel, pretending it was her work and reviewing Miller's writing weekly.\n- Miller moved to Paris unaccompanied in 1930.\n- He was employed as a proofreader for the Chicago Tribune Paris edition in 1931 thanks to Alfred Perlès.\n- This period marked a creative time for Miller, and he began building a network of authors around Villa Seurat.\n- Lawrence Durrell became a lifelong friend.\n- Anaïs Nin and Hugh Guiler financially supported Miller between 1931-1934, covering his living expenses including rent at 18 Villa Seurat.\n- Nin became his lover and financed the first printing of Tropic of Cancer in 1934 with money from Otto Rank.\n- Miller's career as a writer began when he left his job at Western Union to focus on writing full-time in 1924.\n- He spent significant time abroad for his work, first in France (1930), later Greece (1939).\n- His travels also included an extensive tour of the United States between 1940-41.", 12 | "legacy": "- Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip in 1973.\n- Miller participated in the filming of Reds in the late 1970s.\n- Miller held an ongoing correspondence of over 1,500 letters with Brenda Venus between 1978 and 1981.\n- Miller died on June 7, 1980 at his home in Pacific Palisades, Los Angeles, aged 88. He is remembered for his significant contributions to literature and his influence on other authors.\n- The Henry Miller Memorial Library was founded in Big Sur in 1981 by Emil White." 
13 | }, 14 | "raw_extracted_events": "Here's a consolidated and chronological list of biographical events for Henry Miller, based on the provided text:\n\n* **December 26, 1891:** Henry Miller was born in New York City.\n* **Childhood:** Miller was brought up in Brooklyn.\n* **1924:** Miller left his job with Western Union in New York to devote himself to writing.\n* **1930:** Miller went to France.\n* **1939:** Miller visited Greece.\n* **1940–41:** Miller toured the United States extensively.\n* **Later Years:** Miller settled in Big Sur, California, becoming the center of a colony of admirers.\n* **June 7, 1980:** Henry Miller died in Pacific Palisades, California, at the age of 88." 15 | } 16 | -------------------------------------------------------------------------------- /src/test/test_research_events.py: -------------------------------------------------------------------------------- 1 | # tests/research_events/test_research_events.py 2 | 3 | """Tests for the research_events_graph.""" 4 | 5 | from unittest.mock import AsyncMock, patch 6 | 7 | import pytest 8 | from src.state import CategoriesWithEvents 9 | 10 | # Imports are relative to the src directory (configured in pyproject.toml pythonpath) 11 | from research_events.research_events_graph import research_events_app 12 | 13 | 14 | @pytest.fixture 15 | def sample_input_state() -> dict: 16 | """Provide a sample input state for the research_events_app graph.""" 17 | return { 18 | "research_question": "Research the life of Henry Miller", 19 | "existing_events": CategoriesWithEvents( 20 | early="Born in 1920 in Paris.", 21 | personal="Married in 1945.", 22 | career="Published first novel in 1950.", 23 | legacy="Won Nobel Prize in 1980.", 24 | ), 25 | "used_domains": [], 26 | } 27 | 28 | 29 | class MockResponse: 30 | """Mock response class for LLM responses.""" 31 | 32 | def __init__(self, content): 33 | """Initialize mock response with content.""" 34 | self.content = content 35 | 36 | 37 | class MockToolCall: 
38 | """Mock tool call for structured LLM responses.""" 39 | 40 | def __init__(self, name, args): 41 | """Initialize mock tool call with name and args.""" 42 | self.name = name 43 | self.args = args 44 | 45 | 46 | class MockToolResponse: 47 | """Mock tool response for structured LLM responses.""" 48 | 49 | def __init__(self, tool_calls=None): 50 | """Initialize mock tool response with tool calls.""" 51 | self.tool_calls = tool_calls or [] 52 | 53 | 54 | @pytest.fixture 55 | def mock_url_crawler(): 56 | """Provide a reusable mock for url_crawler_app with configurable responses.""" 57 | 58 | def create_mock_crawler(extracted_events): 59 | """Create a configured mock crawler.""" 60 | mock_crawler = AsyncMock() 61 | mock_crawler.ainvoke.return_value = { 62 | "extracted_events": extracted_events, 63 | "raw_scraped_content": "Mock scraped content", 64 | } 65 | return mock_crawler 66 | 67 | return create_mock_crawler 68 | 69 | 70 | @pytest.fixture 71 | def mock_merge_events(): 72 | """Provide a reusable mock for merge_events_app with configurable responses.""" 73 | 74 | def create_mock_merger(existing_events): 75 | """Create a configured mock merger.""" 76 | mock_merger = AsyncMock() 77 | mock_merger.ainvoke.return_value = {"existing_events": existing_events} 78 | return mock_merger 79 | 80 | return create_mock_merger 81 | 82 | 83 | # @pytest.mark.skip(reason="Skip mocked LLM test for now") 84 | @pytest.mark.asyncio 85 | async def test_research_events_with_mocked_llm( 86 | sample_input_state: dict, mock_url_crawler, mock_merge_events 87 | ): 88 | """Unit test for the research events graph with mocked dependencies.""" 89 | # --- Arrange: Mock Data Setup --- 90 | mock_extracted_events = "Born in 1920 in Paris, France. Started writing poetry at age 15. Moved to London in 1942." 91 | mock_existing_events = CategoriesWithEvents( 92 | early="Born in 1920 in Paris, France. Started writing poetry at age 15.", 93 | personal="Married in 1945. 
Moved to London in 1942.", 94 | career="Published first novel in 1950.", 95 | legacy="Won Nobel Prize in 1980.", 96 | ) 97 | 98 | # --- Act: Execute the graph with patched dependencies --- 99 | with ( 100 | patch( 101 | "research_events.research_events_graph.url_crawler_app" 102 | ) as mock_crawler_patch, 103 | patch( 104 | "research_events.research_events_graph.merge_events_app" 105 | ) as mock_merger_patch, 106 | patch("research_events.research_events_graph.TavilySearch") as mock_tavily, 107 | patch("research_events.research_events_graph.create_structured_model") as mock_llm, 108 | ): 109 | # Configure the mocks 110 | mock_crawler_patch.ainvoke = mock_url_crawler(mock_extracted_events).ainvoke 111 | mock_merger_patch.ainvoke = mock_merge_events(mock_existing_events).ainvoke 112 | 113 | # Mock TavilySearch to return empty results (no URLs found) 114 | from unittest.mock import Mock 115 | 116 | mock_tavily_instance = Mock() 117 | mock_tavily_instance.invoke.return_value = {"results": []} 118 | mock_tavily.return_value = mock_tavily_instance 119 | 120 | # Mock the structured LLM to return a test URL 121 | mock_llm_instance = Mock() 122 | mock_llm_instance.invoke.return_value = Mock(selected_urls=["https://example.com/test"]) 123 | mock_llm.return_value = mock_llm_instance 124 | 125 | result = await research_events_app.ainvoke(sample_input_state) 126 | 127 | # --- Assert: Verify the output --- 128 | # The result structure should have existing_events and used_domains 129 | assert "existing_events" in result 130 | assert "used_domains" in result 131 | 132 | existing_events = result["existing_events"] 133 | used_domains = result["used_domains"] 134 | 135 | assert isinstance(existing_events, CategoriesWithEvents) 136 | assert ( 137 | existing_events.early 138 | == "Born in 1920 in Paris, France. Started writing poetry at age 15." 139 | ) 140 | assert existing_events.personal == "Married in 1945. Moved to London in 1942." 
141 | assert existing_events.career == "Published first novel in 1950." 142 | assert existing_events.legacy == "Won Nobel Prize in 1980." 143 | 144 | # Verify that domains were tracked 145 | assert isinstance(used_domains, list) 146 | 147 | 148 | # @pytest.mark.skip(reason="Skip real LLM test for now") 149 | @pytest.mark.llm 150 | @pytest.mark.asyncio 151 | async def test_research_events_with_real_llm(sample_input_state: dict): 152 | """Integration test for the research events graph with real LLM calls.""" 153 | # --- Act --- 154 | result = await research_events_app.ainvoke(sample_input_state) 155 | 156 | # --- Assert --- 157 | # The result structure should have existing_events and used_domains 158 | assert "existing_events" in result 159 | assert "used_domains" in result 160 | 161 | existing_events = result["existing_events"] 162 | used_domains = result["used_domains"] 163 | assert isinstance(existing_events, CategoriesWithEvents) 164 | 165 | # Check that key information is present somewhere in the final events 166 | all_merged_text = " ".join(vars(existing_events).values()) 167 | 168 | # Verify that some content was extracted and merged 169 | assert len(all_merged_text) > 0 170 | # The final events should contain some information from the existing events 171 | assert ( 172 | "1920" in all_merged_text 173 | or "Married" in all_merged_text 174 | or "Nobel Prize" in all_merged_text 175 | ) 176 | 177 | # Verify that domains were tracked 178 | assert isinstance(used_domains, list) 179 | -------------------------------------------------------------------------------- /src/research_events/research_events_graph.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, TypedDict 2 | 3 | from langchain_tavily import TavilySearch 4 | from langgraph.graph import END, START, StateGraph 5 | from langgraph.graph.state import RunnableConfig 6 | from langgraph.types import Command 7 | from pydantic import BaseModel, Field 
8 | from src.configuration import Configuration 9 | from src.llm_service import create_llm_structured_model 10 | from src.research_events.merge_events.merge_events_graph import merge_events_app 11 | from src.services.url_service import URLService 12 | from src.state import CategoriesWithEvents 13 | from src.url_crawler.url_krawler_graph import url_crawler_app 14 | from src.utils import get_langfuse_handler 15 | 16 | 17 | class InputResearchEventsState(TypedDict): 18 | research_question: str 19 | existing_events: CategoriesWithEvents 20 | used_domains: list[str] 21 | 22 | 23 | class ResearchEventsState(InputResearchEventsState): 24 | urls: list[str] 25 | # Add this temporary field 26 | extracted_events: str 27 | 28 | 29 | class OutputResearchEventsState(TypedDict): 30 | existing_events: CategoriesWithEvents 31 | used_domains: list[str] 32 | 33 | 34 | class BestUrls(BaseModel): 35 | selected_urls: list[str] = Field(description="A list of the two best URLs.") 36 | 37 | 38 | def url_finder( 39 | state: ResearchEventsState, 40 | config: RunnableConfig, 41 | ) -> Command[Literal["should_process_url_router"]]: 42 | """Find the urls for the research_question""" 43 | research_question = state.get("research_question", "") 44 | used_domains = state.get("used_domains", []) 45 | 46 | if not research_question: 47 | raise ValueError("research_question is required") 48 | 49 | tool = TavilySearch( 50 | max_results=6, 51 | topic="general", 52 | include_raw_content=False, 53 | include_answer=False, 54 | exclude_domains=used_domains, 55 | ) 56 | 57 | result = tool.invoke({"query": research_question}) 58 | 59 | urls = [result["url"] for result in result["results"]] 60 | 61 | prompt = """ 62 | From the results below, select the two URLs that will provide the most bibliographical events 63 | (key life events, publications, historical records, detailed timelines) about 64 | the subject's life in relation to the research question. 
65 | 66 | 67 | {results} 68 | 69 | 70 | 71 | {research_question} 72 | 73 | 74 | """ 75 | 76 | prompt = prompt.format(results=urls, research_question=research_question) 77 | 78 | structured_llm = create_llm_structured_model(config=config, class_name=BestUrls) 79 | 80 | structured_result = structured_llm.invoke(prompt) 81 | 82 | # return Command( 83 | # goto=END, 84 | # update={ 85 | # "existing_events": CategoriesWithEvents( 86 | # early="test", personal="test", career="test", legacy="test" 87 | # ), 88 | # "used_domains": ["en.wikipedia.org", "www.britannica.com"], 89 | # }, 90 | # ) 91 | 92 | ### call to tavily/duck duck go 93 | # urls = model.invoke(research_question) 94 | # urls = [ 95 | # "https://en.wikipedia.org/wiki/Henry_Miller", 96 | # "https://www.britannica.com/biography/Henry-Miller", 97 | # ] 98 | 99 | return Command( 100 | goto="should_process_url_router", 101 | update={"urls": structured_result.selected_urls}, 102 | ) 103 | 104 | 105 | def updateUrlList( 106 | state: ResearchEventsState, 107 | ) -> tuple[list[str], list[str]]: 108 | urls = state.get("urls", []) 109 | used_domains = state.get("used_domains", []) 110 | 111 | return URLService.update_url_list(urls, used_domains) 112 | 113 | 114 | def should_process_url_router( 115 | state: ResearchEventsState, 116 | ) -> Command[Literal["crawl_url", "__end__"]]: 117 | urls = state.get("urls", []) 118 | used_domains = state.get("used_domains", []) 119 | 120 | if urls and len(urls) > 0: 121 | domain = URLService.extract_domain(urls[0]) 122 | if domain in used_domains: 123 | # remove first url 124 | remaining_urls = urls[1:] 125 | return Command( 126 | goto="should_process_url_router", 127 | update={"urls": remaining_urls, "used_domains": used_domains}, 128 | ) 129 | 130 | print(f"URLs remaining: {len(state['urls'])}. Routing to crawl.") 131 | return Command(goto="crawl_url") 132 | else: 133 | print("No URLs remaining. 
Routing to __end__.") 134 | # Otherwise, end the graph execution 135 | return Command( 136 | goto=END, 137 | ) 138 | 139 | 140 | async def crawl_url( 141 | state: ResearchEventsState, 142 | ) -> Command[Literal["merge_events_and_update"]]: 143 | """Crawls the next URL and updates the temporary state with new events.""" 144 | urls = state["urls"] 145 | url_to_process = urls[0] # Always process the first one 146 | research_question = state.get("research_question", "") 147 | 148 | if not research_question: 149 | raise ValueError("research_question is required for url crawling") 150 | 151 | # Invoke the crawler subgraph 152 | result = await url_crawler_app.ainvoke( 153 | {"url": url_to_process, "research_question": research_question} 154 | ) 155 | extracted_events = result["extracted_events"] 156 | # Go to the merge node, updating the state with the extracted events 157 | return Command( 158 | goto="merge_events_and_update", 159 | update={"extracted_events": extracted_events}, 160 | ) 161 | 162 | 163 | async def merge_events_and_update( 164 | state: ResearchEventsState, 165 | ) -> Command[Literal["should_process_url_router"]]: 166 | """Merges new events, removes the processed URL, and loops back to the router.""" 167 | existing_events = state.get("existing_events", CategoriesWithEvents()) 168 | extracted_events = state.get("extracted_events", "") 169 | research_question = state.get("research_question", "") 170 | 171 | # Invoke the merge subgraph 172 | result = await merge_events_app.ainvoke( 173 | { 174 | "existing_events": existing_events, 175 | "extracted_events": extracted_events, 176 | "research_question": research_question, 177 | } 178 | ) 179 | 180 | remaining_urls, used_domains = updateUrlList(state) 181 | 182 | # Remaining URLs after removal 183 | return Command( 184 | goto="should_process_url_router", 185 | update={ 186 | "existing_events": result["existing_events"], 187 | "urls": remaining_urls, 188 | "used_domains": used_domains, 189 | # "extracted_events": 
"", # Clear the temporary state 190 | }, 191 | ) 192 | 193 | 194 | research_events_builder = StateGraph( 195 | ResearchEventsState, 196 | input_schema=InputResearchEventsState, 197 | output_schema=OutputResearchEventsState, 198 | config_schema=Configuration, 199 | ) 200 | 201 | # Add all the nodes to the graph 202 | research_events_builder.add_node("url_finder", url_finder) 203 | research_events_builder.add_node("should_process_url_router", should_process_url_router) 204 | research_events_builder.add_node("crawl_url", crawl_url) 205 | research_events_builder.add_node("merge_events_and_update", merge_events_and_update) 206 | 207 | # Set the entry point 208 | research_events_builder.add_edge(START, "url_finder") 209 | 210 | 211 | research_events_app = research_events_builder.compile().with_config( 212 | {"callbacks": [get_langfuse_handler()]} 213 | ) 214 | -------------------------------------------------------------------------------- /src/graph.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from langchain_core.messages import ( 4 | HumanMessage, 5 | SystemMessage, 6 | ToolMessage, 7 | ) 8 | from langchain_core.runnables import RunnableConfig 9 | from langgraph.graph import START, StateGraph 10 | from langgraph.types import Command 11 | from src.configuration import Configuration 12 | from src.llm_service import ( 13 | create_llm_structured_model, 14 | create_llm_with_tools, 15 | ) 16 | from src.prompts import ( 17 | events_summarizer_prompt, 18 | lead_researcher_prompt, 19 | structure_events_prompt, 20 | ) 21 | from src.research_events.research_events_graph import research_events_app 22 | from src.state import ( 23 | CategoriesWithEvents, 24 | Chronology, 25 | FinishResearchTool, 26 | ResearchEventsTool, 27 | SupervisorState, 28 | SupervisorStateInput, 29 | ) 30 | from src.utils import get_buffer_string_with_tools, get_langfuse_handler, think_tool 31 | 32 | config = Configuration() 33 | 
MAX_TOOL_CALL_ITERATIONS = config.max_tool_iterations


async def supervisor_node(
    state: SupervisorState,
    config: RunnableConfig,
) -> Command[Literal["supervisor_tools"]]:
    """The 'brain' of the agent: decide the next action via tool calls.

    Builds a system prompt from the research target, the running events
    summary, and the most recent message, then asks the tools-enabled LLM
    which tool to invoke next.

    Args:
        state: Current supervisor state (history, summary, counters).
        config: Runtime configuration forwarded to the LLM factory.

    Returns:
        Command routing to ``supervisor_tools`` with the AI response appended
        to the conversation history and the iteration counter incremented.
    """
    tools = [
        ResearchEventsTool,
        FinishResearchTool,
        think_tool,
    ]

    tools_model = create_llm_with_tools(tools=tools, config=config)
    # Fix: the history is a list of messages — default to [], not "".
    messages = state.get("conversation_history", [])
    # Surface the most recent message (if any) to the prompt.
    last_message = messages[-1] if messages else ""
    system_message = SystemMessage(
        content=lead_researcher_prompt.format(
            person_to_research=state["person_to_research"],
            events_summary=state.get("events_summary", "Everything is missing"),
            last_message=last_message,
            # Fix: advertise the real iteration budget instead of a
            # hard-coded 5 that could drift from the config value.
            max_iterations=MAX_TOOL_CALL_ITERATIONS,
        )
    )

    human_message = HumanMessage(content="Start the research process.")
    prompt = [system_message, human_message]

    response = await tools_model.ainvoke(prompt)

    # The output is an AIMessage with tool_calls, which we add to the history.
    return Command(
        goto="supervisor_tools",
        update={
            "conversation_history": [response],
            "iteration_count": state.get("iteration_count", 0) + 1,
        },
    )


async def supervisor_tools_node(
    state: SupervisorState,
    config: RunnableConfig,
) -> Command[Literal["supervisor", "structure_events"]]:
    """The 'hands' of the agent. Executes tools and routes via Command.

    Handles three tools: ``FinishResearchTool`` (terminate), ``think_tool``
    (record a reflection), and ``ResearchEventsTool`` (run the research
    subgraph and refresh the events summary). Finishes when the LLM made no
    tool calls or the iteration budget is exhausted.

    Args:
        state: Current supervisor state; the last history entry must be the
            AI message whose tool calls are to be executed.
        config: Runtime configuration forwarded to LLM factories.

    Returns:
        Command routing to ``structure_events`` (done) or back to
        ``supervisor`` with tool results merged into state.
    """
    existing_events = state.get(
        "existing_events",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )
    events_summary = state.get("events_summary", "")
    used_domains = state.get("used_domains", [])
    last_message = state["conversation_history"][-1]
    iteration_count = state.get("iteration_count", 0)
    exceeded_allowed_iterations = iteration_count >= MAX_TOOL_CALL_ITERATIONS

    # If the LLM made no tool calls (or we ran out of budget), we finish.
    if not last_message.tool_calls or exceeded_allowed_iterations:
        return Command(goto="structure_events")

    # This is the core logic for executing tools and updating state.
    all_tool_messages = []

    for tool_call in last_message.tool_calls:
        tool_name = tool_call["name"]
        tool_args = tool_call["args"]

        if tool_name == "FinishResearchTool":
            return Command(goto="structure_events")

        elif tool_name == "think_tool":
            # The 'think' tool is special: it just records a reflection.
            # The reflection will be in the message history for the *next*
            # supervisor turn.
            all_tool_messages.append(
                ToolMessage(
                    content=tool_args["reflection"],
                    tool_call_id=tool_call["id"],
                    name=tool_name,
                )
            )

        elif tool_name == "ResearchEventsTool":
            research_question = tool_args["research_question"]
            result = await research_events_app.ainvoke(
                {
                    "research_question": research_question,
                    "existing_events": existing_events,
                    "used_domains": used_domains,
                }
            )
            existing_events = result["existing_events"]
            used_domains = result["used_domains"]

            # Refresh the running summary so the next supervisor turn sees
            # an up-to-date picture of what has been collected.
            summarizer_prompt = events_summarizer_prompt.format(
                existing_events=existing_events
            )
            response = await create_llm_structured_model(config=config).ainvoke(
                summarizer_prompt
            )
            events_summary = response.content

            all_tool_messages.append(
                ToolMessage(
                    content="Called ResearchEventsTool and returned multiple events",
                    tool_call_id=tool_call["id"],
                    name=tool_name,
                )
            )

    # The Command helper tells the graph where to go next and what state to update.
    return Command(
        goto="supervisor",
        update={
            "existing_events": existing_events,
            "conversation_history": all_tool_messages,
            "used_domains": used_domains,
            "events_summary": events_summary,
        },
    )


async def structure_events(state: SupervisorState, config: RunnableConfig) -> dict:
    """Step 2: Structures the cleaned events into JSON format.

    Runs one structured-output extraction per category (early, career,
    personal, legacy) concurrently and concatenates the results.

    Args:
        state: Current researcher state with cleaned events text.
        config: Runtime configuration with model settings.

    Returns:
        Dictionary with ``structured_events``: a list of chronology events.
    """
    # Local import: this node is the only user of asyncio in this module.
    import asyncio

    print("--- Step 2: Structuring Events into JSON ---")

    # Get the cleaned events from the previous step
    existing_events = state.get("existing_events", "")

    if not existing_events:
        print("Warning: No cleaned events text found in state")
        # Fix: use the same output key as the success path; the empty branch
        # previously returned the never-read key "chronology".
        return {"structured_events": []}

    structured_llm = create_llm_structured_model(config=config, class_name=Chronology)

    # One prompt per category, preserving the original output ordering.
    categories = ("early", "career", "personal", "legacy")
    prompts = [
        structure_events_prompt.format(existing_events=existing_events[category])
        for category in categories
    ]

    # Fix: the four extractions are independent — run them concurrently
    # instead of awaiting them one after another.
    responses = await asyncio.gather(
        *(structured_llm.ainvoke(prompt) for prompt in prompts)
    )

    all_events = [event for response in responses for event in response.events]

    return {
        "structured_events": all_events,
    }


workflow = StateGraph(SupervisorState, input_schema=SupervisorStateInput)

# Add the two core nodes plus the final structuring step.
workflow.add_node("supervisor", supervisor_node)
workflow.add_node("supervisor_tools", supervisor_tools_node)
workflow.add_node("structure_events", structure_events)

workflow.add_edge(START, "supervisor")

graph = workflow.compile().with_config({"callbacks": [get_langfuse_handler()]})
-------------------------------------------------------------------------------- 1 | [![Contributors][contributors-shield]][contributors-url] 2 | [![Forks][forks-shield]][forks-url] 3 | [![Stargazers][stars-shield]][stars-url] 4 | [![Issues][issues-shield]][issues-url] 5 | [![Unlicense License][license-shield]][license-url] 6 | [![LinkedIn][linkedin-shield]][linkedin-url] 7 | 8 | # Event Deep Research 9 | 10 | AI Agent that researches the lives of historical figures and extracts the events into a structured JSON timeline. 11 | 12 | Event Deep Research 13 | 14 | ## Table of Contents 15 | 16 | - [Event Deep Research](#event-deep-research) 17 | - [Table of Contents](#table-of-contents) 18 | - [Features](#features) 19 | - [Demo / Example](#demo--example) 20 | - [🚀 Installation](#-installation) 21 | - [Prerequisites](#prerequisites) 22 | - [Setup](#setup) 23 | - [Usage](#usage) 24 | - [Via LangGraph Studio (Recommended)](#via-langgraph-studio-recommended) 25 | - [Configuration (configuration.py)](#configuration-configurationpy) 26 | - [Architecture / Internals](#architecture--internals) 27 | - [Roadmap / Future Work](#roadmap--future-work) 28 | - [Contributing](#contributing) 29 | - [License](#license) 30 | - [Acknowledgments](#acknowledgments) 31 | 32 | --- 33 | 34 | ## Features 35 | 36 | - Supervisor Agent with multiple tools (Research, think, Finish) 37 | - Merge Workflow to incorporate and deduplicate events from multiple sources 38 | - Support for OpenAI, Anthropic, Google, or Local models (Ollama) 39 | 40 | ## Demo / Example 41 | 42 | https://github.com/user-attachments/assets/ebda1625-fdf6-4f3b-a5d2-319d6db40ec2 43 | 44 | **Input:** 45 | 46 | ```json 47 | { 48 | "person_to_research": "Albert Einstein" 49 | } 50 | ``` 51 | 52 | **Output:** 53 | 54 | ```json 55 | { 56 | "structured_events": [ 57 | { 58 | "name": "Birth in Ulm", 59 | "description": "Albert Einstein was born in Ulm, Germany to Hermann and Pauline Einstein", 60 | "date": {"year": 1879, "note": "March
14"}, 61 | "location": "Ulm, German Empire", 62 | "id": "time-1879-03-14T00:00:00Z" 63 | }, 64 | { 65 | "name": "Zurich Polytechnic", 66 | "description": "Entered the Swiss Federal Polytechnic School in Zurich to study physics and mathematics", 67 | "date": {"year": 1896, "note": ""}, 68 | "location": "Zurich, Switzerland", 69 | "id": "time-1896-01-01T00:00:00Z" 70 | }, 71 | { 72 | "name": "Miracle Year Papers", 73 | "description": "Published four groundbreaking papers on photoelectric effect, Brownian motion, special relativity, and mass-energy equivalence", 74 | "date": {"year": 1905, "note": ""}, 75 | "location": "Bern, Switzerland", 76 | "id": "time-1905-01-01T00:00:00Z" 77 | }, 78 | { 79 | "name": "Nobel Prize in Physics", 80 | "description": "Awarded Nobel Prize for his discovery of the law of the photoelectric effect", 81 | "date": {"year": 1921, "note": ""}, 82 | "location": "Stockholm, Sweden", 83 | "id": "time-1921-01-01T00:00:00Z" 84 | }, 85 | { 86 | "name": "Death in Princeton", 87 | "description": "Albert Einstein died at Princeton Hospital after refusing surgery for an abdominal aortic aneurysm", 88 | "date": {"year": 1955, "note": "April 18"}, 89 | "location": "Princeton, New Jersey, USA", 90 | "id": "time-1955-04-18T00:00:00Z" 91 | } 92 | ] 93 | } 94 | ``` 95 | 96 | ## 🚀 Installation 97 | 98 | ### Prerequisites 99 | 100 | - **Python 3.12+** 101 | - **uv** (Python package manager) 102 | 103 | ### Setup 104 | 105 | ```bash 106 | # 1. Clone the repository 107 | git clone https://github.com/bernatsampera/event-deep-research.git 108 | cd event-deep-research 109 | 110 | # 2. Create virtual environment and install dependencies 111 | uv venv && source .venv/bin/activate 112 | uv sync 113 | 114 | # 3. 
Set up environment variables 115 | cp .env.example .env 116 | # Edit .env with your API keys: 117 | # FIRECRAWL_BASE_URL (https://api.firecrawl.com/v1) 118 | # - FIRECRAWL_API_KEY (required for production, optional for local testing) 119 | # - TAVILY_API_KEY (required) 120 | # - OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY (Change model in configuration.py) 121 | 122 | # 4. Start the development server 123 | uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking 124 | # Open http://localhost:2024 to access LangGraph Studio 125 | ``` 126 | 127 | ## Usage 128 | 129 | ### Via LangGraph Studio (Recommended) 130 | 131 | 1. Start the development server: `uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking` 132 | 2. Open http://localhost:2024 133 | 3. Select the `supervisor` graph 134 | 4. Input your research query: 135 | ```json 136 | { 137 | "person_to_research": "Albert Einstein" 138 | } 139 | ``` 140 | 5. Watch the agent work in real-time! 
141 | 142 | ## Configuration (configuration.py) 143 | 144 | llm_model: Primary LLM model to use for both structured output and tools 145 | 146 | # Optional overrides to change the models used for different parts of the workflow 147 | structured_llm_model: Override model for structured output 148 | tools_llm_model: Override model for tools 149 | chunk_llm_model: Small model for chunk biographical event detection 150 | 151 | # Maximum tokens for the models 152 | structured_llm_max_tokens: Maximum tokens for structured output model 153 | tools_llm_max_tokens: Maximum tokens for tools model 154 | 155 | # Maximum retry attempts for the models 156 | max_structured_output_retries: Maximum retry attempts for structured output 157 | max_tools_output_retries: Maximum retry attempts for tool calls 158 | 159 | # Values from graph files 160 | default_chunk_size: Default chunk size for text processing 161 | default_overlap_size: Default overlap size between chunks 162 | max_content_length: Maximum content length to process 163 | max_tool_iterations: Maximum number of tool iterations 164 | max_chunks: Maximum number of chunks to process for biographical event detection 165 | 166 | ## Architecture / Internals 167 | 168 | 1. **Supervisor Agent** - Coordinates the entire workflow, decides next steps 169 | 2. **Research Agent** - Finds relevant biographical sources, manages crawler and merge agents 170 | 3. **URL Crawler** - Extracts content from web pages with Firecrawl 171 | 4. **Merge Agent** - Combines and deduplicates events 172 | 173 | Agent Graph 174 | 175 | ## Roadmap / Future Work 176 | 177 | - Add images to relevant events 178 | - Improve speed of merge graph 179 | 180 | ## Contributing 181 | 182 | We welcome contributions! This is a great project to learn: 183 | 184 | 1. **Fork** the repository 185 | 2. **Create** a feature branch: `git checkout -b feature/amazing-feature` 186 | 3. **Commit** your changes: `git commit -m 'Add amazing feature'` 187 | 4. 
**Push** to the branch: `git push origin feature/amazing-feature` 188 | 5. **Open** a Pull Request 189 | 190 | See the [open issues](https://github.com/bernatsampera/event-deep-research/issues) for a full list of proposed features and known issues. 191 | 192 | ## License 193 | 194 | Distributed under the MIT License. See `LICENSE.txt` for details. 195 | 196 | ## Acknowledgments 197 | 198 | - **[LangChain](https://github.com/langchain-ai/langchain)** - Foundational LLM framework 199 | - **[LangGraph](https://github.com/langchain-ai/langgraph)** - Multi-agent orchestration 200 | - **[Open Deep Research](https://github.com/langchain-ai/open_deep_research)** - Research methodology inspiration 201 | - **[Firecrawl](https://www.firecrawl.com/)** - Web scraping 202 | - **[Tavily](https://tavily.ai/)** - Web search 203 | 204 | [contributors-shield]: https://img.shields.io/github/contributors/bernatsampera/event-deep-research.svg?style=for-the-badge 205 | [contributors-url]: https://github.com/bernatsampera/event-deep-research/graphs/contributors 206 | [forks-shield]: https://img.shields.io/github/forks/bernatsampera/event-deep-research.svg?style=for-the-badge 207 | [forks-url]: https://github.com/bernatsampera/event-deep-research/network/members 208 | [stars-shield]: https://img.shields.io/github/stars/bernatsampera/event-deep-research.svg?style=for-the-badge 209 | [stars-url]: https://github.com/bernatsampera/event-deep-research/stargazers 210 | [issues-shield]: https://img.shields.io/github/issues/bernatsampera/event-deep-research.svg?style=for-the-badge 211 | [issues-url]: https://github.com/bernatsampera/event-deep-research/issues 212 | [license-shield]: https://img.shields.io/github/license/bernatsampera/event-deep-research.svg?style=for-the-badge 213 | [license-url]: https://github.com/bernatsampera/event-deep-research/blob/master/LICENSE.txt 214 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555 215 | 
import asyncio
from typing import List, Literal, TypedDict

from langchain_core.tools import tool
from langgraph.graph import START, StateGraph
from langgraph.graph.state import Command, RunnableConfig
from pydantic import BaseModel, Field
from src.configuration import Configuration
from src.llm_service import create_llm_structured_model, create_llm_with_tools
from src.research_events.chunk_graph import create_biographic_event_graph
from src.research_events.merge_events.prompts import (
    EXTRACT_AND_CATEGORIZE_PROMPT,
    MERGE_EVENTS_TEMPLATE,
)
from src.research_events.merge_events.utils import ensure_categories_with_events
from src.services.event_service import EventService
from src.state import CategoriesWithEvents
from src.url_crawler.utils import chunk_text_by_tokens
from src.utils import get_langfuse_handler

# NOTE: the docstrings and field descriptions of the two tool classes below
# are sent to the LLM as tool schemas — they are runtime behavior, not docs.


class RelevantEventsCategorized(BaseModel):
    """The chunk contains relevant biographical events that have been categorized."""

    early: str = Field(
        description="Bullet points of events related to childhood, upbringing, family, education, and early influences"
    )
    personal: str = Field(
        description="Bullet points of events related to relationships, friendships, family life, residence, and personal traits"
    )
    career: str = Field(
        description="Bullet points of events related to professional journey, publications, collaborations, and milestones"
    )
    legacy: str = Field(
        description="Bullet points of events related to recognition, impact, influence, and how they are remembered"
    )


class IrrelevantChunk(BaseModel):
    """The chunk contains NO biographical events relevant to the research question."""


class InputMergeEventsState(TypedDict):
    """The complete state for the enhanced event merging sub-graph."""

    existing_events: CategoriesWithEvents
    extracted_events: str
    research_question: str


class MergeEventsState(InputMergeEventsState):
    # Internal working state, not part of the public input schema.
    text_chunks: List[str]  # token-based chunks
    categorized_chunks: List[CategoriesWithEvents]  # results per chunk
    extracted_events_categorized: CategoriesWithEvents


class OutputMergeEventsState(TypedDict):
    existing_events: CategoriesWithEvents  # existing events merged with the new ones


async def split_events(
    state: MergeEventsState,
) -> Command[Literal["filter_chunks", "__end__"]]:
    """Split raw extracted text into token-based chunks for filtering."""
    extracted_events = state.get("extracted_events", "")

    if not extracted_events.strip():
        # No content to process — short-circuit the whole sub-graph.
        return Command(
            goto="__end__",
            update={"text_chunks": [], "categorized_chunks": []},
        )

    chunks = await chunk_text_by_tokens(extracted_events)

    # Hard upper bound on work per page; filter_chunks applies the
    # configurable max_chunks cap on top of this.
    return Command(
        goto="filter_chunks",
        update={"text_chunks": chunks[0:20], "categorized_chunks": []},
    )


async def filter_chunks(
    state: MergeEventsState, config: RunnableConfig
) -> Command[Literal["extract_and_categorize_chunk", "__end__"]]:
    """Filter chunks to only process those containing biographical events."""
    chunks = state.get("text_chunks", [])

    if not chunks:
        return Command(
            goto="__end__",
        )

    # Use chunk graph to filter for biographical events
    chunk_graph = create_biographic_event_graph()

    configurable = Configuration.from_runnable_config(config)
    if len(chunks) > configurable.max_chunks:
        # To avoid recursion issues, cap the number of chunks processed.
        chunks = chunks[: configurable.max_chunks]

    # Keep only the chunks in which the detection graph found events.
    relevant_chunks = []
    for chunk in chunks:
        chunk_result = await chunk_graph.ainvoke({"text": chunk}, config)

        has_events = any(
            result.contains_biographic_event
            for result in chunk_result["results"].values()
        )
        print(f"contains_biographic_event: {has_events}")

        if has_events:
            relevant_chunks.append(chunk)

    if not relevant_chunks:
        # No relevant chunks found
        return Command(goto="__end__")

    # Fix: forward only the relevant chunks. Previously the unfiltered
    # `chunks` list was passed on, so the filtering above had no effect.
    return Command(
        goto="extract_and_categorize_chunk",
        update={"text_chunks": relevant_chunks, "categorized_chunks": []},
    )


async def extract_and_categorize_chunk(
    state: MergeEventsState, config: RunnableConfig
) -> Command[Literal["extract_and_categorize_chunk", "merge_categorizations"]]:
    """Extract and categorize one chunk per invocation, looping until done."""
    chunks = state.get("text_chunks", [])
    categorized_chunks = state.get("categorized_chunks", [])

    if len(categorized_chunks) >= len(chunks):
        # all categorized_chunks done → move to merge
        return Command(goto="merge_categorizations")

    # take next chunk (one per graph step, progress tracked by list length)
    chunk = chunks[len(categorized_chunks)]

    prompt = EXTRACT_AND_CATEGORIZE_PROMPT.format(text_chunk=chunk)

    tools = [tool(RelevantEventsCategorized), tool(IrrelevantChunk)]
    model = create_llm_with_tools(tools=tools, config=config)
    response = await model.ainvoke(prompt)

    # Parse response: only a RelevantEventsCategorized call yields events;
    # anything else (IrrelevantChunk, no call) becomes an empty result.
    if (
        response.tool_calls
        and response.tool_calls[0]["name"] == "RelevantEventsCategorized"
    ):
        categorized_data = response.tool_calls[0]["args"]
        # Convert any list values to strings (LLMs sometimes emit arrays).
        categorized_data = {
            k: "\n".join(v) if isinstance(v, list) else v
            for k, v in categorized_data.items()
        }
        categorized = CategoriesWithEvents(**categorized_data)
    else:
        categorized = CategoriesWithEvents(early="", personal="", career="", legacy="")

    return Command(
        goto="extract_and_categorize_chunk",  # loop until all chunks processed
        update={"categorized_chunks": categorized_chunks + [categorized]},
    )


async def merge_categorizations(
    state: MergeEventsState,
) -> Command[Literal["combine_new_and_original_events"]]:
    """Merge all categorized chunks into a single CategoriesWithEvents."""
    results = state.get("categorized_chunks", [])

    merged = EventService.merge_categorized_events(results)

    return Command(
        goto="combine_new_and_original_events",
        update={"extracted_events_categorized": merged},
    )


async def combine_new_and_original_events(
    state: MergeEventsState, config: RunnableConfig
) -> Command:
    """Merge original and new events for each category using an LLM."""
    print("Combining new and original events...")

    existing_events_raw = state.get(
        "existing_events",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )
    new_events_raw = state.get(
        "extracted_events_categorized",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )

    # Convert to proper Pydantic models if they're dicts
    existing_events = ensure_categories_with_events(existing_events_raw)
    new_events = ensure_categories_with_events(new_events_raw)

    if not new_events or not any(
        getattr(new_events, cat, "").strip()
        for cat in CategoriesWithEvents.model_fields.keys()
    ):
        print("No new events found. Keeping existing events.")
        return Command(goto="__end__", update={"existing_events": existing_events})

    merge_tasks = []
    categories = CategoriesWithEvents.model_fields.keys()

    for category in categories:
        # Safe to use getattr: both sides are guaranteed Pydantic models here.
        existing_text = getattr(existing_events, category, "").strip()
        new_text = getattr(new_events, category, "").strip()

        if not (existing_text or new_text):
            continue  # nothing to merge in this category

        existing_display = existing_text if existing_text else "No events"
        new_display = new_text if new_text else "No events"

        prompt = MERGE_EVENTS_TEMPLATE.format(
            original=existing_display, new=new_display
        )

        # Use regular structured model for merging (not tools model).
        # Fix: create_llm_structured_model is now imported once at module
        # level instead of on every loop iteration.
        merge_tasks.append(
            (category, create_llm_structured_model(config=config).ainvoke(prompt))
        )

    final_merged_dict = {}
    if merge_tasks:
        # Run all per-category merges concurrently.
        task_categories, tasks = zip(*merge_tasks)
        responses = await asyncio.gather(*tasks)
        final_merged_dict = {
            cat: resp.content for cat, resp in zip(task_categories, responses)
        }

    # Ensure all categories are included, falling back to the existing text.
    for category in CategoriesWithEvents.model_fields.keys():
        if category not in final_merged_dict:
            final_merged_dict[category] = getattr(existing_events, category, "")

    final_merged_output = CategoriesWithEvents(**final_merged_dict)
    return Command(goto="__end__", update={"existing_events": final_merged_output})


merge_events_graph_builder = StateGraph(
    MergeEventsState, input_schema=InputMergeEventsState, config_schema=Configuration
)

merge_events_graph_builder.add_node("split_events", split_events)
merge_events_graph_builder.add_node("filter_chunks", filter_chunks)
merge_events_graph_builder.add_node(
    "extract_and_categorize_chunk", extract_and_categorize_chunk
)
merge_events_graph_builder.add_node("merge_categorizations", merge_categorizations)
merge_events_graph_builder.add_node(
    "combine_new_and_original_events", combine_new_and_original_events
)

merge_events_graph_builder.add_edge(START, "split_events")


merge_events_app = merge_events_graph_builder.compile().with_config(
    {
        "callbacks": [get_langfuse_handler()],
        # Fix: LangGraph's config key is snake_case "recursion_limit";
        # the previous camelCase "recursionLimit" was silently ignored,
        # leaving the default limit in place.
        "recursion_limit": 200,
    },
)
"early 1900s" 39 | }, 40 | "location": "Williamsburg, Brooklyn", 41 | "id": "henry_miller_attended_high_school" 42 | }, 43 | { 44 | "name": "Attended City College of New York", 45 | "description": "Miller attended the City College of New York for one semester.", 46 | "date": { 47 | "year": 1910, 48 | "note": "early 1910s" 49 | }, 50 | "location": "New York City", 51 | "id": "henry_miller_attended_city_college" 52 | }, 53 | { 54 | "name": "Henry Miller married Beatrice Sylvas Wickens", 55 | "description": "Miller married Beatrice Sylvas Wickens, an amateur pianist.", 56 | "date": { 57 | "year": 1917, 58 | "note": "" 59 | }, 60 | "location": "New York", 61 | "id": "henry_miller_married_beatrice_sylvas_wickens" 62 | }, 63 | { 64 | "name": "Daughter Barbara born", 65 | "description": "Miller and Beatrice had a daughter named Barbara.", 66 | "date": { 67 | "year": 1919, 68 | "note": "" 69 | }, 70 | "location": "New York", 71 | "id": "henry_miller_daughter_barbara_born" 72 | }, 73 | { 74 | "name": "Worked at Western Union", 75 | "description": "Miller worked at Western Union as personnel manager in the messenger department.", 76 | "date": { 77 | "year": 1920, 78 | "note": "1920-1924" 79 | }, 80 | "location": "New York City", 81 | "id": "henry_miller_worked_at_western_union" 82 | }, 83 | { 84 | "name": "Wrote first novel Clipped Wings", 85 | "description": "Miller wrote his first novel, 'Clipped Wings,' during a three-week vacation in March (unpublished, only fragments remain).", 86 | "date": { 87 | "year": 1922, 88 | "note": "March" 89 | }, 90 | "location": "New York City", 91 | "id": "henry_miller_wrote_clipped_wings" 92 | }, 93 | { 94 | "name": "Divorced Beatrice Sylvas Wickens", 95 | "description": "Miller was divorced from Beatrice Sylvas Wickens.", 96 | "date": { 97 | "year": 1923, 98 | "note": "December 21" 99 | }, 100 | "location": "New York", 101 | "id": "henry_miller_divorced_beatrice" 102 | }, 103 | { 104 | "name": "Met June Mansfield", 105 | "description": 
"Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall.", 106 | "date": { 107 | "year": 1923, 108 | "note": "" 109 | }, 110 | "location": "New York", 111 | "id": "henry_miller_met_june_mansfield" 112 | }, 113 | { 114 | "name": "Married June Mansfield", 115 | "description": "Miller married June Mansfield.", 116 | "date": { 117 | "year": 1924, 118 | "note": "June 1" 119 | }, 120 | "location": "New York", 121 | "id": "henry_miller_married_june_mansfield" 122 | }, 123 | { 124 | "name": "Quit Western Union to write", 125 | "description": "Miller quit his job at Western Union to dedicate himself completely to writing.", 126 | "date": { 127 | "year": 1924, 128 | "note": "" 129 | }, 130 | "location": "New York", 131 | "id": "henry_miller_quit_western_union" 132 | }, 133 | { 134 | "name": "Wrote Moloch", 135 | "description": "Miller wrote 'Moloch: or, This Gentile World,' initially under the guise of a novel by June Mansfield (unpublished until 1992).", 136 | "date": { 137 | "year": 1927, 138 | "note": "1927-1928" 139 | }, 140 | "location": "New York", 141 | "id": "henry_miller_wrote_moloch" 142 | }, 143 | { 144 | "name": "Spent months in Paris with June", 145 | "description": "Miller spent several months in Paris with June, trip financed by Roland Freedman.", 146 | "date": { 147 | "year": 1928, 148 | "note": "" 149 | }, 150 | "location": "Paris, France", 151 | "id": "henry_miller_paris_with_june_1928" 152 | }, 153 | { 154 | "name": "Moved to Paris alone", 155 | "description": "Miller moved to Paris unaccompanied.", 156 | "date": { 157 | "year": 1930, 158 | "note": "" 159 | }, 160 | "location": "Paris, France", 161 | "id": "henry_miller_moved_to_paris_alone" 162 | }, 163 | { 164 | "name": "Proofreader for Chicago Tribune Paris", 165 | "description": "Miller was employed by the Chicago Tribune Paris edition as a proofreader.", 166 | "date": { 167 | "year": 1931, 168 | "note": "" 169 | }, 170 | "location": "Paris, France", 171 | "id": 
"henry_miller_proofreader_chicago_tribune" 172 | }, 173 | { 174 | "name": "Published Tropic of Cancer", 175 | "description": "Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States).", 176 | "date": { 177 | "year": 1934, 178 | "note": "" 179 | }, 180 | "location": "Paris, France", 181 | "id": "henry_miller_published_tropic_of_cancer" 182 | }, 183 | { 184 | "name": "Divorced by June in Mexico", 185 | "description": "June divorced Miller by proxy.", 186 | "date": { 187 | "year": 1934, 188 | "note": "" 189 | }, 190 | "location": "Mexico City, Mexico", 191 | "id": "henry_miller_divorced_by_june" 192 | }, 193 | { 194 | "name": "Published Black Spring", 195 | "description": "Miller published 'Black Spring' (banned in the United States).", 196 | "date": { 197 | "year": 1936, 198 | "note": "" 199 | }, 200 | "location": "Paris, France", 201 | "id": "henry_miller_published_black_spring" 202 | }, 203 | { 204 | "name": "Published Tropic of Capricorn", 205 | "description": "Miller published 'Tropic of Capricorn' (banned in the United States).", 206 | "date": { 207 | "year": 1939, 208 | "note": "" 209 | }, 210 | "location": "Paris, France", 211 | "id": "henry_miller_published_tropic_of_capricorn" 212 | }, 213 | { 214 | "name": "Visited Greece", 215 | "description": "Miller visited Greece, invited by Lawrence Durrell who was living in Corfu.", 216 | "date": { 217 | "year": 1939, 218 | "note": "" 219 | }, 220 | "location": "Greece", 221 | "id": "henry_miller_visited_greece" 222 | }, 223 | { 224 | "name": "Returned to New York", 225 | "description": "Miller returned to New York.", 226 | "date": { 227 | "year": 1940, 228 | "note": "" 229 | }, 230 | "location": "New York City", 231 | "id": "henry_miller_returned_to_new_york" 232 | }, 233 | { 234 | "name": "Published The Colossus of Maroussi", 235 | "description": "Miller described his visit to Greece in 'The Colossus of Maroussi'.", 236 | "date": { 237 | "year": 1941, 238 | 
"note": "" 239 | }, 240 | "location": "United States", 241 | "id": "henry_miller_published_colossus_of_maroussi" 242 | }, 243 | { 244 | "name": "Moved to California", 245 | "description": "Miller moved to California in June, initially residing just outside Hollywood in Beverly Glen.", 246 | "date": { 247 | "year": 1942, 248 | "note": "June" 249 | }, 250 | "location": "California", 251 | "id": "henry_miller_moved_to_california" 252 | }, 253 | { 254 | "name": "Began writing Sexus", 255 | "description": "Miller began writing 'Sexus,' the first novel in 'The Rosy Crucifixion' trilogy.", 256 | "date": { 257 | "year": 1942, 258 | "note": "" 259 | }, 260 | "location": "California", 261 | "id": "henry_miller_began_sexus" 262 | }, 263 | { 264 | "name": "Settled in Big Sur", 265 | "description": "Miller settled in Big Sur.", 266 | "date": { 267 | "year": 1944, 268 | "note": "" 269 | }, 270 | "location": "Big Sur, California", 271 | "id": "henry_miller_settled_big_sur" 272 | }, 273 | { 274 | "name": "Married Janina Martha Lepska", 275 | "description": "Miller married Janina Martha Lepska.", 276 | "date": { 277 | "year": 1944, 278 | "note": "" 279 | }, 280 | "location": "United States", 281 | "id": "henry_miller_married_janina_lepska" 282 | }, 283 | { 284 | "name": "Published Sunday After the War", 285 | "description": "Miller published 'Sunday After the War'.", 286 | "date": { 287 | "year": 1944, 288 | "note": "" 289 | }, 290 | "location": "United States", 291 | "id": "henry_miller_published_sunday_after_the_war" 292 | }, 293 | { 294 | "name": "Published The Air-Conditioned Nightmare", 295 | "description": "Miller published 'The Air-Conditioned Nightmare'.", 296 | "date": { 297 | "year": 1945, 298 | "note": "" 299 | }, 300 | "location": "United States", 301 | "id": "henry_miller_published_air_conditioned_nightmare" 302 | }, 303 | { 304 | "name": "Lived in Big Sur with bohemian writers", 305 | "description": "Miller continued living in Big Sur with other bohemian writers.", 
306 | "date": { 307 | "year": 1947, 308 | "note": "from 1947" 309 | }, 310 | "location": "Big Sur, California", 311 | "id": "henry_miller_lived_big_sur_bohemians" 312 | }, 313 | { 314 | "name": "Divorced Janina Martha Lepska", 315 | "description": "Miller was divorced from Janina Martha Lepska.", 316 | "date": { 317 | "year": 1952, 318 | "note": "" 319 | }, 320 | "location": "United States", 321 | "id": "henry_miller_divorced_janina" 322 | }, 323 | { 324 | "name": "Married artist Eve McClure", 325 | "description": "Miller married artist Eve McClure.", 326 | "date": { 327 | "year": 1953, 328 | "note": "" 329 | }, 330 | "location": "United States", 331 | "id": "henry_miller_married_eve_mcclure" 332 | }, 333 | { 334 | "name": "Published Big Sur and the Oranges of Hieronymus Bosch", 335 | "description": "Miller published 'Big Sur and the Oranges of Hieronymus Bosch'.", 336 | "date": { 337 | "year": 1957, 338 | "note": "" 339 | }, 340 | "location": "United States", 341 | "id": "henry_miller_published_big_sur_oranges" 342 | }, 343 | { 344 | "name": "Completed The Rosy Crucifixion trilogy", 345 | "description": "Miller completed 'The Rosy Crucifixion' trilogy (initially banned in the U.S., published in France and Japan).", 346 | "date": { 347 | "year": 1959, 348 | "note": "" 349 | }, 350 | "location": "United States", 351 | "id": "henry_miller_completed_rosy_crucifixion" 352 | }, 353 | { 354 | "name": "Divorced Eve McClure", 355 | "description": "Miller was divorced from Eve McClure.", 356 | "date": { 357 | "year": 1960, 358 | "note": "" 359 | }, 360 | "location": "United States", 361 | "id": "henry_miller_divorced_eve" 362 | }, 363 | { 364 | "name": "Reunion with June in New York", 365 | "description": "Miller arranged a reunion with ex-wife June in New York.", 366 | "date": { 367 | "year": 1961, 368 | "note": "" 369 | }, 370 | "location": "New York", 371 | "id": "henry_miller_reunion_with_june" 372 | }, 373 | { 374 | "name": "Tropic of Cancer published in the US", 375 | 
"description": "'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials.", 376 | "date": { 377 | "year": 1961, 378 | "note": "" 379 | }, 380 | "location": "United States", 381 | "id": "henry_miller_tropic_cancer_us_publication" 382 | }, 383 | { 384 | "name": "Moved to Pacific Palisades", 385 | "description": "Miller moved to 444 Ocampo Drive, Pacific Palisades, Los Angeles.", 386 | "date": { 387 | "year": 1963, 388 | "note": "" 389 | }, 390 | "location": "Pacific Palisades, Los Angeles, California", 391 | "id": "henry_miller_moved_pacific_palisades" 392 | }, 393 | { 394 | "name": "Married Hiroko Tokuda", 395 | "description": "Miller married Hiroko Tokuda.", 396 | "date": { 397 | "year": 1967, 398 | "note": "" 399 | }, 400 | "location": "United States", 401 | "id": "henry_miller_married_hiroko_tokuda" 402 | }, 403 | { 404 | "name": "Published On Turning Eighty", 405 | "description": "Miller published 'On Turning Eighty,' a chapbook with 200 copies.", 406 | "date": { 407 | "year": 1972, 408 | "note": "" 409 | }, 410 | "location": "United States", 411 | "id": "henry_miller_published_on_turning_eighty" 412 | }, 413 | { 414 | "name": "Nominated for Nobel Prize", 415 | "description": "Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip.", 416 | "date": { 417 | "year": 1973, 418 | "note": "" 419 | }, 420 | "location": "", 421 | "id": "henry_miller_nobel_nomination" 422 | }, 423 | { 424 | "name": "Divorced Hiroko Tokuda", 425 | "description": "Miller was divorced from Hiroko Tokuda.", 426 | "date": { 427 | "year": 1977, 428 | "note": "" 429 | }, 430 | "location": "United States", 431 | "id": "henry_miller_divorced_hiroko" 432 | }, 433 | { 434 | "name": "Henry Miller died", 435 | "description": "Henry Miller died of circulatory complications at home.", 436 | "date": { 437 | "year": 1980, 438 | "note": "June 7" 439 | }, 440 | "location": "Pacific Palisades, Los Angeles", 441 | 
"id": "henry_miller_died" 442 | } 443 | ], 444 | "url_events_summarized": " Henry Valentine Miller was born at his family's home, 450 East 85th Street, in the Yorkville section of Manhattan, New York City, U.S. He was the son of Lutheran German parents, Louise Marie (Neiting) and tailor Heinrich Miller. Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school While he was a socialist, his idol was the black Socialist Hubert Harrison Miller married his first wife, Beatrice Sylvas Wickens, in 1917;[11] their divorce was granted on December 21, 1923.[12] Together they had a daughter, Barbara, born in 1919" 445 | } 446 | --------------------------------------------------------------------------------