├── .python-version
├── media
├── kronologs-graph.webp
├── event-deep-research.webp
└── kronologs-lgstudiograph.webp
├── index.py
├── Makefile
├── .gitignore
├── .env.example
├── .github
└── repository.yml
├── langgraph.json
├── src
├── core
│ └── error_handling.py
├── services
│ ├── url_service.py
│ └── event_service.py
├── research_events
│ ├── merge_events
│ │ ├── show.json
│ │ ├── utils.py
│ │ ├── shortcategorized.json
│ │ ├── prompts.py
│ │ ├── fullcategorized.json
│ │ ├── merge_events_graph.py
│ │ └── test.json
│ ├── chunk_graph.py
│ ├── result.json
│ └── research_events_graph.py
├── url_crawler
│ ├── url_krawler_graph.py
│ ├── prompts.py
│ └── utils.py
├── llm_service.py
├── utils.py
├── configuration.py
├── test
│ ├── test_enhanced_merge_events.py
│ ├── test_url_crawler.py
│ ├── test_merge_events.py
│ └── test_research_events.py
├── prompts.py
├── state.py
└── graph.py
├── LICENSE.TXT
├── AGENTS.md
├── pyproject.toml
├── scripts
└── geocode.py
└── README.md
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 |
--------------------------------------------------------------------------------
/media/kronologs-graph.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/kronologs-graph.webp
--------------------------------------------------------------------------------
/media/event-deep-research.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/event-deep-research.webp
--------------------------------------------------------------------------------
/media/kronologs-lgstudiograph.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bernatsampera/event-deep-research/HEAD/media/kronologs-lgstudiograph.webp
--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
from langchain_core.messages import ToolMessage

# Scratch smoke-test script: construct a minimal ToolMessage and print it to
# verify that langchain-core is installed and importable.
message = ToolMessage(content="Test", tool_call_id="123")


print(message)
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | dev:
6 | source .venv/bin/activate && uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking
7 |
8 |
9 |
10 | test:
11 | uv run pytest -v -s
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python-generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # Virtual environments
10 | .venv
11 |
12 |
13 | **/.vscode/
14 |
15 | **/.env
16 |
17 |
18 | backend/scripts/
19 | backend/src/data/*
20 |
21 | .langgraph_api/**
22 |
23 |
24 | src/data/*
25 |
26 | .langgraph_api/*
27 | .pytest_cache/*
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 |
2 | # Web scraping and search APIs
3 | FIRECRAWL_BASE_URL="https://api.firecrawl.dev"
4 | FIRECRAWL_API_KEY=""
5 | TAVILY_API_KEY=
6 |
7 | # LLM Provider API Keys (choose one or more)
8 | OPENAI_API_KEY=
9 | ANTHROPIC_API_KEY=
10 | GOOGLE_API_KEY=
11 |
12 | # Optional: Langfuse for observability
13 | LANGFUSE_PUBLIC_KEY=""
14 | LANGFUSE_SECRET_KEY=""
15 | LANGFUSE_HOST=""
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.github/repository.yml:
--------------------------------------------------------------------------------
1 | # GitHub repository metadata
2 | name: event-deep-research
3 | description: AI-powered agent that automatically researches historical figures and creates structured biographical timelines from web sources
4 | topics:
5 | - ai-agents
6 | - event-extraction
7 | - knowledge-graphs
8 | - web-scraping
9 | - langgraph
10 | - biographical-research
11 | - automated-research
12 | - llm-agents
13 | - timeline-generation
14 |
--------------------------------------------------------------------------------
/langgraph.json:
--------------------------------------------------------------------------------
1 | {
2 | "dockerfile_lines": [],
3 | "graphs": {
4 | "supervisor": "./src/graph.py:graph",
5 | "research_events": "./src/research_events/research_events_graph.py:research_events_app",
6 | "merge_events_graph": "./src/research_events/merge_events/merge_events_graph.py:merge_events_app",
7 | "url_crawler": "./src/url_crawler/url_krawler_graph.py:url_crawler_app",
8 | "chunk_graph": "./src/research_events/chunk_graph.py:graph"
9 | },
10 | "python_version": "3.12",
11 | "env": ".env",
12 | "dependencies": ["."],
13 | "auth": {}
14 | }
15 |
--------------------------------------------------------------------------------
/src/core/error_handling.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 | from typing import Any, Dict
3 |
4 | from langgraph.types import Command
5 |
6 |
class GraphError(Exception):
    """Raised when a graph node fails in a known, recoverable way."""

    def __init__(self, message: str, node: str, state: dict):
        """Record the failing node's name, the error message, and a state snapshot."""
        super().__init__(f"Error in {node}: {message}")
        self.message = message
        self.node = node
        self.state = state
13 |
14 |
def with_error_handling(func):
    """Decorator that routes any exception from an async graph node to ``error_handler``.

    On failure, the returned Command carries the error text, the node name,
    and a snapshot of the state at the moment of the crash.
    """

    @wraps(func)
    async def guarded(state: Dict[str, Any], config) -> Command:
        try:
            return await func(state, config)
        except Exception as exc:  # boundary handler by design: any node failure is captured
            return Command(
                goto="error_handler",
                update={
                    "error": str(exc),
                    "node": func.__name__,
                    "state_snapshot": state,
                },
            )

    return guarded
29 |
--------------------------------------------------------------------------------
/src/services/url_service.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse
2 | from typing import List
3 |
4 |
class URLService:
    """Stateless helpers for URL bookkeeping during crawling."""

    @staticmethod
    def extract_domain(url: str) -> str:
        """Return the network-location (domain) portion of *url*."""
        return urlparse(url).netloc

    @staticmethod
    def update_url_list(urls: List[str], used_domains: List[str]) -> tuple[List[str], List[str]]:
        """Pop the first URL off *urls* and record its domain as used.

        Returns the remaining URLs and the (possibly extended) domain list.
        Neither input list is mutated; empty input is returned unchanged.
        """
        if not urls:
            return urls, used_domains

        head, *tail = urls
        domain = URLService.extract_domain(head)

        seen = list(used_domains)
        if domain not in seen:
            seen.append(domain)

        return tail, seen
--------------------------------------------------------------------------------
/src/research_events/merge_events/show.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "Henry Miller was born",
4 | "description": "Henry Valentine Miller was born on December 26 in New York City.",
5 | "date": {
6 | "year": 1891,
7 | "note": "December 26"
8 | },
9 | "location": "New York City",
10 | "id": "henry_miller_born",
11 | "new_information": "None"
12 | },
13 | {
14 | "name": "Divorced Beatrice Sylvas Wickens",
15 | "description": "Miller was divorced from Beatrice Sylvas Wickens.",
16 | "date": {
17 | "year": 1923,
18 | "note": "December 21"
19 | },
20 | "location": "New York",
21 | "id": "henry_miller_divorced_beatrice",
22 | "new_information": "None"
23 | },
24 | {
25 | "name": "Daughter Barbara born",
26 | "description": "Miller and Beatrice had a daughter named Barbara.",
27 | "date": {
28 | "year": 1919,
29 | "note": ""
30 | },
31 | "location": "New York",
32 | "id": "henry_miller_daughter_barbara_born",
33 | "new_information": "None"
34 | }
35 | ]
36 |
--------------------------------------------------------------------------------
/src/services/event_service.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from src.state import CategoriesWithEvents
3 |
4 |
class EventService:
    """Stateless helpers for chunking and merging extracted event text."""

    @staticmethod
    def split_events_into_chunks(extracted_events: str, max_len: int = 2000) -> List[str]:
        """Slice the raw events text into consecutive pieces of at most *max_len* characters."""
        return [
            extracted_events[offset : offset + max_len]
            for offset in range(0, len(extracted_events), max_len)
        ]

    @staticmethod
    def merge_categorized_events(categorized_results: List[CategoriesWithEvents]) -> CategoriesWithEvents:
        """Concatenate each category's text across all partial results.

        NOTE(review): every category starts from the literal "[]" seed and grows
        by plain string concatenation — confirm downstream consumers expect the
        leading "[]" prefix rather than a merged JSON array.
        """
        merged = CategoriesWithEvents(
            early="[]",
            personal="[]",
            career="[]",
            legacy="[]",
        )

        for partial in categorized_results:
            for field in ("early", "personal", "career", "legacy"):
                setattr(merged, field, getattr(merged, field) + getattr(partial, field))

        return merged
--------------------------------------------------------------------------------
/LICENSE.TXT:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Bernat Sampera
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
1 | # Development Guidelines for Deep Event Research
2 |
3 | ## Build/Test Commands
4 | - **Run all tests**: `make test` or `uv run pytest -v -s`
5 | - **Run single test**: `uv run pytest src/test/test_file.py::test_function -v`
6 | - **Run tests without LLM calls**: `uv run pytest -v -m 'not llm'`
7 | - **Run LLM integration tests**: `uv run pytest -v -m llm`
8 | - **Lint code**: `uv run ruff check src/`
9 | - **Format code**: `uv run ruff format src/`
10 | - **Start dev server**: `make dev`
11 |
12 | ## Code Style Guidelines
13 | - **Python**: 3.12+ with type hints required
14 | - **Imports**: Use `from src.module import name` for internal imports, standard library first
15 | - **Formatting**: Ruff with Google docstring convention
16 | - **Error handling**: Use `@with_error_handling` decorator for graph nodes, raise `GraphError` for known failures
17 | - **Async**: All graph functions must be async and return `Command`
18 | - **Testing**: Use pytest with asyncio mode, mock LLM calls by default, mark real LLM tests with `@pytest.mark.llm`
19 | - **State management**: Use TypedDict classes from `src.state` for all state objects
20 | - **Services**: Static methods in service classes, no instance state
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "backend"
3 | version = "0.1.0"
4 | description = "AI-powered agent that automatically researches historical figures and creates structured biographical timelines from web sources"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 | "python-dotenv>=0.9.9",
9 | "langchain>=0.3.27",
10 | "langchain-google-genai>=2.1.9",
11 | "langgraph>=0.6.7",
12 | "langgraph-cli",
13 | "langchain-openai>=0.3.33",
14 | "langchain-ollama>=0.3.8",
15 | "langchain-tavily>=0.2.11",
16 | "tiktoken>=0.11.0",
17 | "pytest>=8.4.2",
18 | "pytest-asyncio>=0.24.0",
19 | "langfuse>=3.5.2",
20 | "aiohttp>=3.8.0",
21 | ]
22 |
23 | [build-system]
24 | requires = ["hatchling"]
25 | build-backend = "hatchling.build"
26 |
27 | [tool.hatch.build.targets.wheel]
28 | packages = ["src"]
29 |
30 | [tool.ruff]
31 | src = ["src"]
32 | lint.select = [
33 | "E", # pycodestyle
34 | "F", # pyflakes
35 | "I", # isort
36 | "D", # pydocstyle
37 | "D401", # First line should be in imperative mood
38 | "T201",
39 | "UP",
40 | ]
41 | lint.ignore = [
42 | "UP006",
43 | "UP007",
44 | "UP035",
45 | "D417",
46 | "E501",
47 | ]
48 |
49 | [tool.ruff.lint.per-file-ignores]
50 | "src/test/*" = ["D", "UP"]
51 |
52 | [tool.ruff.lint.pydocstyle]
53 | convention = "google"
54 |
55 | [tool.pytest.ini_options]
56 | testpaths = ["src/test"]
57 | pythonpath = ["src"]
58 | asyncio_mode = "auto"
59 | addopts = "-v -m 'not llm'"
60 | markers = [
61 | "llm: marks tests that make real LLM API calls"
62 | ]
63 |
64 | [tool.setuptools.packages.find]
65 | where = ["src"]
66 |
67 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Type, TypeVar, Union
2 |
3 | from pydantic import BaseModel
4 | from src.state import CategoriesWithEvents
5 |
6 | T = TypeVar("T", bound=BaseModel)
7 |
8 |
def ensure_pydantic_model(data: Union[dict, T], model_class: Type[T]) -> T:
    """Coerce *data* into an instance of *model_class*.

    Args:
        data: A dict, an existing *model_class* instance, or any object
            exposing ``__dict__``.
        model_class: Target Pydantic model class.

    Returns:
        An instance of *model_class*; existing instances pass through as-is.

    Raises:
        TypeError: If *data* cannot be converted to *model_class*.
    """
    # Dicts are the common case: graph state is often serialized this way.
    if isinstance(data, dict):
        return model_class(**data)
    if isinstance(data, model_class):
        return data
    # Last resort: rebuild the model from the object's attribute dict.
    if hasattr(data, "__dict__"):
        return model_class(**data.__dict__)
    raise TypeError(f"Cannot convert {type(data)} to {model_class}")
37 |
38 |
# Graph state sometimes arrives as a plain dict (models get serialized between
# nodes), which makes attribute access awkward; this wrapper normalizes it.
def ensure_categories_with_events(
    data: Union[dict, CategoriesWithEvents],
) -> "CategoriesWithEvents":
    """Coerce *data* (dict or model) into a CategoriesWithEvents instance."""
    return ensure_pydantic_model(data, CategoriesWithEvents)
46 |
--------------------------------------------------------------------------------
/src/url_crawler/url_krawler_graph.py:
--------------------------------------------------------------------------------
1 | import random
2 | from typing import Literal, TypedDict
3 |
4 | from langgraph.graph import END, START, StateGraph
5 | from langgraph.graph.state import Command
6 | from src.configuration import Configuration
7 | from src.url_crawler.utils import url_crawl
8 | from src.utils import get_langfuse_handler
9 |
10 | config = Configuration()
11 | MAX_CONTENT_LENGTH = config.max_content_length
12 |
13 |
class InputUrlCrawlerState(TypedDict):
    """Public input schema for the URL-crawler graph."""

    url: str
    research_question: str


class UrlCrawlerState(InputUrlCrawlerState):
    """Internal working state: adds the raw page text."""

    raw_scraped_content: str


class OutputUrlCrawlerState(UrlCrawlerState):
    """Output schema exposed to callers of the compiled graph."""

    extracted_events: str
    # NOTE(review): raw_scraped_content is already inherited from
    # UrlCrawlerState; this re-declaration looks redundant — confirm.
    raw_scraped_content: str
26 |
27 |
async def scrape_content(state: UrlCrawlerState) -> Command[Literal["__end__"]]:
    """Scrape the state's URL and pass the (possibly truncated) text straight through."""
    target_url = state.get("url", "")

    content = await url_crawl(target_url)

    # Oversized pages: keep one random MAX_CONTENT_LENGTH-wide window so that
    # repeated crawls of the same page can surface different sections.
    overflow = len(content) - MAX_CONTENT_LENGTH
    if overflow > 0:
        start = random.randint(0, overflow)
        content = content[start : start + MAX_CONTENT_LENGTH]

    return Command(
        goto=END,
        update={
            "raw_scraped_content": content,
            # Same text duplicated under the key older consumers expect.
            "extracted_events": content,  # For compatibility with existing interface
        },
    )
46 |
47 |
# Single-node graph: START -> scrape_content; scrape_content itself issues the
# Command that routes to END, so no outgoing edge is declared here.
builder = StateGraph(
    UrlCrawlerState,
    input_schema=InputUrlCrawlerState,
    output_schema=OutputUrlCrawlerState,
    config_schema=Configuration,
)

builder.add_node("scrape_content", scrape_content)
builder.add_edge(START, "scrape_content")


# Compiled app with the Langfuse tracing callback attached at import time.
url_crawler_app = builder.compile().with_config({"callbacks": [get_langfuse_handler()]})
60 |
--------------------------------------------------------------------------------
/src/url_crawler/prompts.py:
--------------------------------------------------------------------------------
# --- Prompt 1: For extracting events from a small text chunk ---
# Fixes: removed duplicated header comment; corrected "researc" typo inside the
# prompt text (the typo was shown verbatim to the LLM).
EXTRACT_EVENTS_PROMPT = """
You are a Biographical Event Extractor. Your single focus is to find events that directly answer the research question: **"{research_question}"**



- `RelevantChunk` (use this if the text is almost entirely relevant (>80%))
- `PartialChunk` (use this if the text is a mix of relevant and irrelevant content)
- `IrrelevantChunk` (use this if the text contains no events that are relevant to the biography of the person in the research question)



**EXTRACTION RULE for `PartialChunk`**: You *must* extract the complete relevant sentences, including all details like dates, names, locations, and context. Do not summarize.


{text_chunk}


You must call exactly one of the provided tools. Do not respond with plain text.
Choose only the tool call and the tool call arguments.
"""
# --- Prompt 2: For turning extracted text blocks into a single chronological event list ---
create_event_list_prompt = """You are a biographical assistant. Your task is to convert blocks of text that contains events of a person into single events where the date, description of the event, location of the event are included for {research_question}.

**Instructions**:
- Analyze the "New Extracted Events" and convert them into single events where the date, description of the event, location of the event are included.
- **MAINTAIN** a chronological order.

**Output Format**:
- A single, comprehensive, and chronological list in bullet points.


New Extracted Events:
----
{newly_extracted_events}




"""
46 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/shortcategorized.json:
--------------------------------------------------------------------------------
1 | {
2 | "existing_events": {
3 | "early": "- Henry Miller was born: Henry Valentine Miller was born on December 26 in New York City. (1891, New York City)\n- Attended Eastern District High School: Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school. (early 1900s, Williamsburg, Brooklyn)",
4 | "personal": "- Met June Mansfield: Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall. (1923, New York)\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)",
5 | "career": "- Published Tropic of Cancer: Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States). (1934, Paris, France)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)",
6 | "legacy": "- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)"
7 | },
8 | "new_events": "- Met Robert W. Service in Paris: Miller met author Robert W. Service on a Paris street and discussed books, with Service recalling the encounter in his autobiography. (1928, Paris)\n- Financial support from Anaïs Nin: Anaïs Nin, with Hugh Guiler, financed Miller's living expenses through the 1930s, including rent for an apartment at 18 Villa Seurat, and funded the first printing of Tropic of Cancer in 1934. (1930s, Paris)\n- Learned about and promoted George Dibbern: During the late 1930s, Miller learned about German-born sailor George Dibbern, helped promote his memoir Quest, and organized charity to help him. (late 1930s, France)\n- Wrote The Smile at the Foot of the Ladder: In 1948, Miller wrote a novella he called his 'most singular story,' The Smile at the Foot of the Ladder. (1948, California)"
9 | }
10 |
--------------------------------------------------------------------------------
/scripts/geocode.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | # Input JSON file from current directory
4 | import os
5 | import time
6 |
7 | import requests
8 |
9 | INPUT_FILE = os.path.join(os.path.dirname(__file__), "events.json")
10 | OUTPUT_FILE = os.path.join(os.path.dirname(__file__), "events_with_coords.json")
11 |
12 |
# Geocoding function using OpenStreetMap Nominatim
def geocode_location(location_name: str):
    """Look up *location_name* via Nominatim and return ``(lat, lon)`` floats.

    Returns:
        A ``(lat, lon)`` tuple of floats, or ``(None, None)`` when the lookup
        fails or yields no match.
    """
    url = "https://nominatim.openstreetmap.org/search"
    params = {"q": location_name, "format": "json", "limit": 1}
    headers = {"User-Agent": "EventGeocoder/1.0"}

    try:
        # Fix: always pass a timeout — requests has none by default, so a
        # stalled connection would hang the script forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data:
            return float(data[0]["lat"]), float(data[0]["lon"])
    except Exception as e:
        # Best-effort script: report and fall through to the (None, None) result.
        print(f"Error geocoding {location_name}: {e}")

    return None, None
29 |
30 |
def main():
    """Geocode each event's location and write the augmented list to OUTPUT_FILE."""
    # Load events
    with open(INPUT_FILE, encoding="utf-8") as f:
        events = json.load(f)

    # Process each event
    for i, event in enumerate(events, 1):
        # Add auto-incremental ID
        event["id"] = i

        # Only process location if it's a non-empty string
        if isinstance(event.get("location"), str) and event["location"].strip():
            loc_name = event["location"]
            print(f"Geocoding: {loc_name} ...")
            lat, lng = geocode_location(loc_name)
            # Fix: compare against None explicitly — 0.0 is a valid coordinate
            # (equator / prime meridian) and must not be treated as a failure.
            if lat is not None and lng is not None:
                event["location"] = {"name": loc_name, "lat": lat, "lng": lng}
            else:
                event["location"] = {"name": loc_name, "lat": None, "lng": None}

            # Be polite to the API
            time.sleep(1)
        else:
            # Remove location field entirely for empty locations
            if "location" in event:
                del event["location"]

    # Save new JSON
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(events, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved updated events to {OUTPUT_FILE}")
63 |
64 |
65 | if __name__ == "__main__":
66 | main()
67 |
--------------------------------------------------------------------------------
/src/research_events/chunk_graph.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, TypedDict
2 |
3 | from langgraph.graph import END, START, StateGraph
4 | from langgraph.graph.state import CompiledStateGraph
5 | from pydantic import BaseModel, Field
6 | from src.configuration import Configuration
7 | from src.llm_service import create_llm_chunk_model
8 |
9 |
class BiographicEventCheck(BaseModel):
    """Structured-output schema for the per-chunk LLM verdict."""

    contains_biographic_event: bool = Field(
        description="Whether the text chunk contains biographical events"
    )


class ChunkResult(BaseModel):
    """A chunk's text paired with its LLM verdict."""

    content: str
    contains_biographic_event: bool = Field(
        description="Whether the text chunk contains biographical events"
    )


class ChunkState(TypedDict):
    """Graph state: input text, its chunks, and per-chunk results keyed 'chunk_N'."""

    text: str
    chunks: List[str]
    results: Dict[str, ChunkResult]
27 |
28 |
def split_text(state: ChunkState) -> ChunkState:
    """Break the input text into fixed-size 2000-character chunks."""
    chunk_size = 2000
    source = state["text"]
    pieces = [
        source[offset : offset + chunk_size]
        for offset in range(0, len(source), chunk_size)
    ]
    return {"chunks": pieces}
35 |
36 |
def check_chunk_for_events(state: ChunkState, config) -> ChunkState:
    """Check each chunk for biographical events using structured output.

    Invokes the small "chunk" LLM once per chunk (synchronously, in order) and
    records the boolean verdict alongside the chunk text under keys
    "chunk_0", "chunk_1", ...
    """
    model = create_llm_chunk_model(config, BiographicEventCheck)
    results = {}

    for i, chunk in enumerate(state["chunks"]):
        # Prompt wording is part of the model contract — change with care.
        prompt = f"""
Analyze this text chunk and determine if it contains SPECIFIC biographical events.

ONLY mark as true if the chunk contains:
- Birth/death dates or locations
- Marriage ceremonies or relationships
- Educational enrollment or graduation
- Career appointments or job changes
- Awards, prizes, or honors received
- Relocations to new cities/countries
- Major discoveries or inventions

DO NOT mark as true for:
- General descriptions or background information
- Character traits or personality descriptions
- General statements about time periods
- Descriptions of places without personal connection
- General knowledge or context

The event must be specific and concrete, not general background.

Text chunk: "{chunk}"
"""

        # Structured output guarantees a BiographicEventCheck instance back.
        result = model.invoke(prompt)
        results[f"chunk_{i}"] = ChunkResult(
            content=chunk, contains_biographic_event=result.contains_biographic_event
        )

    return {"results": results}
73 |
74 |
def create_biographic_event_graph() -> CompiledStateGraph:
    """Build and compile the two-step biographic event detection graph."""
    workflow = StateGraph(ChunkState, config_schema=Configuration)

    workflow.add_node("split_text", split_text)
    workflow.add_node("check_events", check_chunk_for_events)

    workflow.add_edge(START, "split_text")
    workflow.add_edge("split_text", "check_events")
    workflow.add_edge("check_events", END)

    return workflow.compile()
87 |
88 |
89 | graph = create_biographic_event_graph()
90 |
--------------------------------------------------------------------------------
/src/llm_service.py:
--------------------------------------------------------------------------------
1 | from typing import List, Type
2 |
3 | from langchain.chat_models import init_chat_model
4 | from langchain_core.runnables import Runnable, RunnableConfig
5 | from langchain_core.tools import BaseTool
6 | from pydantic import BaseModel
7 | from src.configuration import Configuration
8 | from src.utils import get_api_key_for_model
9 |
10 | configurable_model = init_chat_model(
11 | configurable_fields=("model", "max_tokens", "api_key", "reasoning")
12 | )
13 |
14 |
# This contains the shared logic. The underscore _ means other files shouldn't use it.
def _build_and_configure_model(
    config: RunnableConfig,
    model_chain: Runnable,
    model_name: str,
    max_tokens: int,
    max_retries: int,
) -> Runnable:
    """Internal helper to apply retry and runtime configuration.

    Args:
        config: Runnable config, passed through to the API-key lookup.
        model_chain: The (possibly tool-bound or structured) model chain to wrap.
        model_name: Provider-prefixed model identifier, e.g. "openai:gpt-4o".
        max_tokens: Token cap applied at runtime.
        max_retries: Attempts allowed by ``with_retry`` before giving up.
    """
    model_config = {
        "model": model_name,
        "max_tokens": max_tokens,
        "api_key": get_api_key_for_model(model_name, config),
        # NOTE(review): "False" is the *string*, not the bool — confirm the
        # configurable "reasoning" field really expects a string here.
        "reasoning": "False",
    }
    return model_chain.with_retry(stop_after_attempt=max_retries).with_config(
        model_config
    )
33 |
34 |
# --- Public Function 1: For Models WITH Tools ---
def create_llm_with_tools(
    tools: List[Type[BaseTool]], config: RunnableConfig
) -> Runnable:
    """Build a chat model with the given tools bound, configured from runtime config."""
    settings = Configuration.from_runnable_config(config)

    # Bind the tool schemas first; retry/config wrapping happens in the helper.
    chain = configurable_model.bind_tools(tools)

    return _build_and_configure_model(
        config=config,
        model_chain=chain,
        model_name=settings.get_llm_with_tools_model(),
        max_tokens=settings.tools_llm_max_tokens,
        max_retries=settings.max_tools_output_retries,
    )
52 |
53 |
# --- Public Function 2: For Models WITHOUT Tools ---
def create_llm_structured_model(
    config: RunnableConfig, class_name: Type[BaseModel] | None = None
) -> Runnable:
    """Build a general-purpose chat model, optionally with structured output."""
    settings = Configuration.from_runnable_config(config)

    # With a schema class, wrap the base model for structured output;
    # otherwise the chain is just the base model itself.
    chain = (
        configurable_model.with_structured_output(class_name)
        if class_name
        else configurable_model
    )

    return _build_and_configure_model(
        config=config,
        model_chain=chain,
        model_name=settings.get_llm_structured_model(),
        max_tokens=settings.structured_llm_max_tokens,
        max_retries=settings.max_structured_output_retries,
    )
74 |
75 |
# --- Public Function 3: For Small Chunk Models ---
def create_llm_chunk_model(
    config: RunnableConfig, class_name: Type[BaseModel] | None = None
) -> Runnable:
    """Build a small, cheap model for per-chunk biographical event detection."""
    settings = Configuration.from_runnable_config(config)

    chain = configurable_model
    if class_name:
        chain = configurable_model.with_structured_output(class_name)

    return _build_and_configure_model(
        config=config,
        model_chain=chain,
        model_name=settings.get_llm_chunk_model(),
        max_tokens=1024,  # deliberately small: chunk checks are short
        max_retries=2,  # fail fast on cheap per-chunk calls
    )
96 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from langchain_core.messages import (
4 | AIMessage,
5 | BaseMessage,
6 | HumanMessage,
7 | SystemMessage,
8 | ToolMessage,
9 | )
10 | from langchain_core.runnables import RunnableConfig
11 | from langchain_core.tools import tool
12 |
13 |
# NOTE: the docstring below doubles as the LLM-facing tool contract — it is
# shown to the model by the tool framework, so edit its wording with care.
@tool(
    description="Mandatory reflection tool. Analyze results and plan the next search query."
)
def think_tool(reflection: str) -> str:
    """Mandatory reflection step. Use this to analyze the last result, identify gaps, and formulate the EXACT query for the next search.

    You MUST use this tool immediately after every ResearchEventsTool call.

    Analyze if an additional call to the ResearchEventsTool is needed to fill the gaps or the research is completed. When is completed, you must call the FinishResearchTool.

    The `reflection` argument must follow the structure defined in the system prompt, culminating in the precise search query you will use next.

    Args:
        reflection: Structured analysis of the last result, current gaps, and the PLANNED QUERY for the next step.

    Returns:
        Confirmation and instruction to proceed to the next step.
    """
    # The return value is crucial. It becomes the ToolMessage the LLM sees next.
    # By explicitly telling it what to do, we break the loop.
    return f"Reflection recorded. {reflection}"
35 |
36 |
def get_api_key_for_model(model_name: str, config: "RunnableConfig"):
    """Resolve the API key for a provider-prefixed model name.

    Args:
        model_name: Model id with a provider prefix, e.g. "openai:gpt-4o" or
            "google_genai:gemini-2.5-flash". Matching is case-insensitive.
        config: Runnable config (currently unused; kept for interface stability).

    Returns:
        The provider's API key read from the environment, or None when the
        provider needs no key (ollama) or the prefix is unrecognized.
    """
    model_name = model_name.lower()

    if model_name.startswith("openai:"):
        return os.getenv("OPENAI_API_KEY")
    elif model_name.startswith("anthropic:"):
        return os.getenv("ANTHROPIC_API_KEY")
    elif model_name.startswith("google"):
        # No colon: matches both "google_genai:" and "google_vertexai:" prefixes.
        # SECURITY FIX: removed a debug print that wrote the raw API key to
        # stdout — never log secret values.
        return os.getenv("GOOGLE_API_KEY")
    elif model_name.startswith("ollama:"):
        # Ollama runs locally and doesn't need an API key.
        return None
    return None
52 |
53 |
def get_buffer_string_with_tools(messages: list[BaseMessage]) -> str:
    """Render a message list as a role-labelled, newline-joined transcript.

    AIMessages include a summary of any tool calls they carry, and
    ToolMessages are labelled with their tool name when available.
    """

    def _render(msg: BaseMessage) -> str:
        # One branch per known message type; unknown types fall through.
        if isinstance(msg, HumanMessage):
            return f"Human: {msg.content}"
        if isinstance(msg, AIMessage):
            text = f"AI: {msg.content}"
            calls = getattr(msg, "tool_calls", None)
            if calls:
                summary = ", ".join(
                    f"{c.get('name', 'unknown')}({c.get('args', {})})" for c in calls
                )
                text += f" [Tool calls: {summary}]"
            return text
        if isinstance(msg, SystemMessage):
            return f"System: {msg.content}"
        if isinstance(msg, ToolMessage):
            label = (
                getattr(msg, "name", None) or getattr(msg, "tool", None) or "unknown_tool"
            )
            return f"Tool[{label}]: {msg.content}"
        # Fallback for unknown or custom message types.
        return f"{msg.__class__.__name__}: {msg.content}"

    return "\n".join(_render(m) for m in messages)
84 |
85 |
def get_langfuse_handler():
    """Return a Langfuse CallbackHandler, or None when langfuse isn't installed."""
    try:
        from langfuse.langchain import CallbackHandler
    except ImportError:
        # Tracing is optional — run without it when the dependency is missing.
        return None
    else:
        return CallbackHandler()
93 |
--------------------------------------------------------------------------------
/src/url_crawler/utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import re
4 | from typing import List
5 |
6 | import aiohttp
7 | import tiktoken
8 |
# Firecrawl scrape endpoint. FIRECRAWL_BASE_URL overrides the hosted default
# (e.g. for a self-hosted instance). Resolved once at import time, so changing
# the env var after import has no effect.
FIRECRAWL_API_URL = (
    f"{os.getenv('FIRECRAWL_BASE_URL', 'https://api.firecrawl.dev')}/v0/scrape"
)
12 |
13 |
async def url_crawl(url: str) -> str:
    """Scrape a URL via Firecrawl and return its markdown with links stripped.

    Returns an empty string when scraping fails (scrape_page_content yields None).
    """
    content = await scrape_page_content(url)
    return remove_markdown_links(content) if content is not None else ""
24 |
25 |
async def scrape_page_content(url: str):
    """Scrapes URL using Firecrawl API and returns Markdown content.

    Returns the markdown string on success, or None on any failure
    (network error, non-2xx status, missing payload fields).
    """
    try:
        headers = {"Content-Type": "application/json"}

        # Add API key if available (a self-hosted Firecrawl may not need one).
        api_key = os.getenv("FIRECRAWL_API_KEY")
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"

        async with aiohttp.ClientSession() as session:
            async with session.post(
                FIRECRAWL_API_URL,
                json={
                    "url": url,
                    # NOTE(review): "pageOptions" is Firecrawl v0 request syntax
                    # while "formats" matches the v1 API — confirm the endpoint
                    # actually honors this mixed payload.
                    "pageOptions": {"onlyMainContent": True},
                    "formats": ["markdown"],
                },
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=30),  # hard cap per request
            ) as response:
                response.raise_for_status()  # surface HTTP errors into the except below
                data = await response.json()
                # Markdown is read from data["data"]["markdown"]; missing keys -> None.
                return data.get("data", {}).get("markdown")
    except Exception as e:
        # Best-effort scrape: swallow all errors so callers can treat None as
        # "no content" instead of crashing the whole graph run.
        print(f"Error scraping page content: {e}")
        return None
53 |
54 |
def remove_markdown_links(markdown_text):
    """Strip Markdown inline links, keeping only the display text."""
    # "[text](target)" -> "text"; non-greedy so adjacent links stay separate.
    link_pattern = r"\[(.*?)\]\(.*?\)"
    return re.sub(link_pattern, r"\1", markdown_text)
58 |
59 |
# Module-level cache so the encoding is built at most once per process.
_tokenizer = None


def get_tokenizer():
    """Return the shared cl100k_base tiktoken encoding, creating it lazily."""
    global _tokenizer
    if _tokenizer is None:
        # First call pays the load cost; every later call reuses the instance.
        _tokenizer = tiktoken.get_encoding("cl100k_base")
    return _tokenizer
70 |
71 |
async def chunk_text_by_tokens(
    text: str, chunk_size: int = 1000, overlap_size: int = 20
) -> List[str]:
    """Split text into token-based, overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum tokens per chunk; must be positive.
        overlap_size: Tokens shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        Decoded text chunks (empty list for empty input).

    Raises:
        ValueError: If chunk_size <= 0 or overlap_size >= chunk_size — with
            the old code such values made the loop's step non-positive, so
            start_index never advanced and the loop ran forever.
    """
    if not text:
        return []

    step = chunk_size - overlap_size
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and greater than overlap_size "
            f"(got chunk_size={chunk_size}, overlap_size={overlap_size})"
        )

    # Load tokenizer in a thread to avoid blocking the event loop.
    encoding = await asyncio.to_thread(get_tokenizer)
    tokens = encoding.encode(text)

    # Slide a chunk_size window across the token list, advancing by `step`
    # so consecutive chunks share overlap_size tokens.
    chunks = []
    start_index = 0
    while start_index < len(tokens):
        chunk_tokens = tokens[start_index : start_index + chunk_size]
        chunks.append(encoding.decode(chunk_tokens))
        start_index += step

    return chunks
101 |
102 |
async def count_tokens(messages: List[str]) -> int:
    """Return the combined token count of every string in `messages`."""
    # The first tokenizer load can be slow; keep it off the event loop.
    encoding = await asyncio.to_thread(get_tokenizer)
    total = 0
    for msg in messages:
        total += len(encoding.encode(msg))
    return total
108 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/prompts.py:
--------------------------------------------------------------------------------
# Prompt for sorting already-extracted events into the four fixed biography
# categories. Single placeholder: {events}.
# Fix: "and they are remembered today" was missing "how" — now matches the
# identical category definition used in EXTRACT_AND_CATEGORIZE_PROMPT.
categorize_events_prompt = """
You are a helpful assistant that will categorize the events into the 4 categories.


{events}



early: Covers childhood, upbringing, family, education, and early influences that shaped the author.
personal: Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs.
career: Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones.
legacy: Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and how they are remembered today.




INCLUDE ALL THE INFORMATION FROM THE EVENTS, do not abbreviate or omit any information.

"""
20 |
# Prompt for the chunk-level extractor: decide whether a text chunk holds
# relevant biographical events and, if so, categorize them. Single
# placeholder: {text_chunk}.
# Fix: removed a stray "**" left dangling after "historical figure".
EXTRACT_AND_CATEGORIZE_PROMPT = """
You are a Biographical Event Extractor and Categorizer. Your task is to analyze text chunks for events related to the life of the historical figure.


- `IrrelevantChunk` (use if the text contains NO biographical events relevant to the research question)
- `RelevantEventsCategorized` (use if the text contains relevant events - categorize them into the 4 categories)



early: Covers childhood, upbringing, family, education, and early influences that shaped the author.
personal: Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs.
career: Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones.
legacy: Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and how they are remembered today.


**EXTRACTION RULES**:
- Extract COMPLETE sentences with ALL available details (dates, names, locations, context, emotions, motivations)
- Include surrounding context that makes the event meaningful and complete
- Preserve the original narrative flow and descriptive language
- Capture cause-and-effect relationships and consequences
- Include only events directly relevant to the research question
- Maintain chronological order within each category
- Format as clean bullet points with complete, detailed descriptions (e.g., "- In the spring of 1965, while living in a small apartment in Paris, she attended a poetry reading that fundamentally changed her approach to writing, inspiring her to experiment with free verse.")
- IMPORTANT: Return each category as a SINGLE string containing all bullet points, not as a list


{text_chunk}


You must call exactly one of the provided tools. Do not respond with plain text.
"""
52 |
53 |
# Prompt for merging newly extracted events into an existing category list
# without losing or rewording the originals. Placeholders: {original}, {new}.
MERGE_EVENTS_TEMPLATE = """You are a helpful assistant that will merge two lists of events:
the original events (which must always remain) and new events (which may contain extra details).
The new events should only be treated as additions if they provide relevant new information.
The final output must preserve the original events and seamlessly add the new ones if applicable.


- Always include the original events exactly, do not omit or alter them.
- Add new events only if they are not duplicates, combining details if they overlap.
- Format the final list as bullet points, one event per line (e.g., "- Event details.").
- Keep the list clean, concise, and without commentary.



Original events:
{original}

New events:
{new}


"""
77 |
--------------------------------------------------------------------------------
/src/configuration.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any
3 |
4 | from langchain_core.runnables import RunnableConfig
5 | from pydantic import BaseModel, Field
6 |
7 |
class Configuration(BaseModel):
    """Main configuration class for the Deep Research agent.

    Values are resolved by `from_runnable_config` with this precedence:
    environment variable (FIELD_NAME upper-cased) > the RunnableConfig's
    "configurable" entry > the field default declared below.

    Fix: removed leftover debug `print` calls from the getters — library
    code should not write to stdout on every config lookup.
    """

    # Single model for most providers (simplified configuration)
    llm_model: str = Field(
        default="google_genai:gemini-2.5-flash",
        description="Primary LLM model to use for both structured output and tools",
    )

    # Optional per-role overrides; None means "fall back to llm_model".
    structured_llm_model: str | None = Field(
        default=None,
        description="Override model for structured output",
    )
    tools_llm_model: str | None = Field(
        default=None,
        description="Override model for tools",
    )
    chunk_llm_model: str | None = Field(
        default=None,
        description="Small model for chunk biographical event detection",
    )

    structured_llm_max_tokens: int = Field(
        default=4096, description="Maximum tokens for structured output model"
    )
    tools_llm_max_tokens: int = Field(
        default=4096, description="Maximum tokens for tools model"
    )

    max_structured_output_retries: int = Field(
        default=3, description="Maximum retry attempts for structured output"
    )
    max_tools_output_retries: int = Field(
        default=3, description="Maximum retry attempts for tool calls"
    )

    # Values that used to be hardcoded in the graph files
    default_chunk_size: int = Field(
        default=800, description="Default chunk size for text processing"
    )
    default_overlap_size: int = Field(
        default=20, description="Default overlap size between chunks"
    )
    max_content_length: int = Field(
        default=100000, description="Maximum content length to process"
    )
    max_tool_iterations: int = Field(
        default=5, description="Maximum number of tool iterations"
    )
    max_chunks: int = Field(
        default=20,
        description="Maximum number of chunks to process for biographical event detection",
    )

    def get_llm_structured_model(self) -> str:
        """Get the LLM structured model, using the override if provided."""
        return self.structured_llm_model or self.llm_model

    def get_llm_with_tools_model(self) -> str:
        """Get the LLM with tools model, using the override if provided."""
        return self.tools_llm_model or self.llm_model

    def get_llm_chunk_model(self) -> str:
        """Get the LLM chunk model, falling back to a small local Ollama model."""
        return self.chunk_llm_model or "ollama:gemma3:4b"

    @classmethod
    def from_runnable_config(
        cls, config: RunnableConfig | None = None
    ) -> "Configuration":
        """Create a Configuration instance from a RunnableConfig.

        Environment variables take precedence over "configurable" entries;
        fields resolved to None are dropped so pydantic applies the declared
        defaults instead.
        """
        configurable = config.get("configurable", {}) if config else {}
        values: dict[str, Any] = {
            name: os.environ.get(name.upper(), configurable.get(name))
            for name in cls.model_fields
        }
        return cls(**{k: v for k, v in values.items() if v is not None})
100 |
--------------------------------------------------------------------------------
/src/test/test_enhanced_merge_events.py:
--------------------------------------------------------------------------------
1 | """Tests for the enhanced merge events graph."""
2 |
3 | from unittest.mock import AsyncMock, Mock, patch
4 |
5 | import pytest
6 | from src.state import CategoriesWithEvents
7 | from src.research_events.merge_events.merge_events_graph import merge_events_app
8 |
9 |
@pytest.fixture
def sample_merge_input_state() -> dict:
    """Provide a sample input state for the enhanced merge events graph."""
    existing = CategoriesWithEvents(
        early="Born in 1920 in Paris.",
        personal="Married in 1945.",
        career="Published first novel in 1950.",
        legacy="Won Nobel Prize in 1980.",
    )
    return {
        "existing_events": existing,
        "extracted_events": (
            "He was born in New York City in 1920 and started writing at age 15. "
            "He published his first book in 1950 and won the Pulitzer Prize in 1985."
        ),
        "research_question": "Research the life of the author",
    }
23 |
24 |
class MockToolCall:
    """Mock tool call for structured LLM responses.

    Exposes "name" and "args" both as attributes and via dict-style
    subscripting, mirroring LangChain tool-call dicts.
    """

    def __init__(self, name, args):
        """Store the tool name and its argument mapping."""
        self.name = name
        self.args = args

    def __getitem__(self, key):
        """Dict-style access, limited to the "name" and "args" keys."""
        mapping = {"name": self.name, "args": self.args}
        if key not in mapping:
            raise KeyError(f"Key {key} not found in MockToolCall")
        return mapping[key]
41 |
42 |
class MockToolResponse:
    """Mock tool response for structured LLM responses."""

    def __init__(self, tool_calls=None):
        """Store the tool calls, defaulting any falsy value to an empty list."""
        self.tool_calls = tool_calls if tool_calls else []
49 |
50 |
@pytest.mark.asyncio
async def test_enhanced_merge_events_with_mocked_llm(sample_merge_input_state: dict):
    """Unit test for the enhanced merge events graph with mocked dependencies."""
    # --- Act: Execute the graph with patched dependencies ---
    # Patch the factory where the graph module looks it up so every node that
    # builds a tools model receives the mock instead of a real LLM.
    with patch(
        "src.research_events.merge_events.merge_events_graph.create_tools_model"
    ) as mock_tools_model:

        # Mock the tools model response for categorization: one
        # RelevantEventsCategorized call with a bullet per populated category.
        mock_tools_response = MockToolResponse([
            MockToolCall("RelevantEventsCategorized", {
                "early": "- Born in New York City in 1920",
                "personal": "",
                "career": "- Published first book in 1950",
                "legacy": "- Won Pulitzer Prize in 1985",
            })
        ])
        # Wiring order matters: the instance's ainvoke must be configured
        # before the factory mock is set to return it.
        mock_tools_instance = AsyncMock()
        mock_tools_instance.ainvoke.return_value = mock_tools_response
        mock_tools_model.return_value = mock_tools_instance

        result = await merge_events_app.ainvoke(sample_merge_input_state)

    # --- Assert: Verify the output ---
    assert "existing_events" in result
    existing_events = result["existing_events"]
    assert isinstance(existing_events, CategoriesWithEvents)

    # Verify that the tools model was called for categorization
    mock_tools_instance.ainvoke.assert_called()
81 |
82 |
@pytest.mark.asyncio
async def test_enhanced_merge_events_with_empty_content():
    """Test enhanced merge events with empty extracted content."""
    baseline = CategoriesWithEvents(
        early="Born in 1920.",
        personal="Married in 1945.",
        career="Published in 1950.",
        legacy="Won prize in 1980.",
    )
    input_state = {
        "existing_events": baseline,
        "extracted_events": "",  # nothing new to merge
        "research_question": "Test question",
    }

    result = await merge_events_app.ainvoke(input_state)

    # With no extracted content, the graph must pass the events through untouched.
    assert "existing_events" in result
    merged = result["existing_events"]
    assert merged.early == "Born in 1920."
    assert merged.personal == "Married in 1945."
    assert merged.career == "Published in 1950."
    assert merged.legacy == "Won prize in 1980."
--------------------------------------------------------------------------------
/src/prompts.py:
--------------------------------------------------------------------------------
# System prompt for the lead research agent loop. Placeholders:
# {person_to_research}, {events_summary}, {last_message}.
# NOTE(review): the empty backtick pairs ("``") and bare "**" / "****" below
# look like markup whose tag names were lost (e.g. a section reference such
# as "<events_summary>") — confirm against the original prompt source before
# editing the wording.
lead_researcher_prompt = """
You are a meticulous research agent. Your primary directive is to follow a strict, state-based execution cycle to build a comprehensive event timeline for: **{person_to_research}**.

**
On every turn, you MUST follow these steps in order:

1. **Step 1: Check for Completion.**
    * Examine the ``. If it explicitly states the research is COMPLETE, you MUST immediately call the `FinishResearchTool` and stop.


**CRITICAL CONSTRAINTS:**
* NEVER call `ResearchEventsTool` twice in a row.
* NEVER call `think_tool` twice in a row.
* ALWAYS call exactly ONE tool per turn.


{events_summary}



{last_message}





****
* `ResearchEventsTool`: Finds events about the historical figure.
* `FinishResearchTool`: Ends the research process. Call this ONLY when the research is complete
* `think_tool`: Use this to analyze results and plan the EXACT search query for your next action.

**CRITICAL: Use think_tool before calling ResearchEventsTool to plan your approach, and after each ResearchEventsTool to assess progress. Do not call think_tool two times in a row.**


1. **Top Priority Gap:** Identify the SINGLE most important missing piece of information from the ``.
2 **Planned Query:** Write the EXACT search query you will use in the next `ResearchEventsTool` call to fill that gap.

**CRITICAL:** Execute ONLY ONE tool call now, following the ``.
"""
40 |
41 |
# Prompt for maintaining a rolling, numbered log of the agent conversation.
# Placeholders: {previous_messages_summary}, {new_messages}.
# Fix: corrected the "it's" -> "its" typo in the output instruction.
create_messages_summary_prompt = """You are a specialized assistant that maintains a summary of the conversation between the user and the assistant.


1. AI Call: Order to call the ResearchEventsTool, the assistant asked the user for the research question.
2. Tool Call: The assistant called the ResearchEventsTool with the research question.
3. AI Call: Order to call think_tool to analyze the results and plan the next action.
4. Tool Call: The assistant called the think_tool.
...



{previous_messages_summary}



{new_messages}



Return just the new log entry with its corresponding number and content.
Do not include Ids of tool calls



X.


Output:
"""
71 |
72 |
# Prompt that asks the model for the two biggest information gaps in the
# collected events. Placeholder: {existing_events}.
# NOTE(review): the repository dump skips two line numbers inside this string —
# some prompt text may have been lost; verify against the original file.
events_summarizer_prompt = """
Analyze the following events and identify only the 2 biggest gaps in information. Be brief and general.

**Events:**
{existing_events}



**Gaps:**
"""
85 |
86 |
# Prompt for converting the final, ordered event list into structured JSON
# (name / description / date / location per event). Placeholder:
# {existing_events}. The `note` field convention here must stay in sync with
# the ChronologyDate model in state.py.
structure_events_prompt = """You are a data processing specialist. Your sole task is to convert a pre-cleaned, chronologically ordered list of life events into a structured JSON object.


You will be given a list of events that is already de-duplicated and ordered. You must not change the order or content of the events. For each event in the list, you will extract its name, a detailed description, its date, and location, and format it as JSON.



1. For the `name` field, create a short, descriptive title for the event (e.g., "Birth of Pablo Picasso").
2. For the `description` field, provide the clear and concise summary of what happened from the input text.
3. For the `date` field, populate `year`, `month`, and `day` whenever possible.
4. If the date is an estimate or a range (e.g., "circa 1912" or "Between 1920-1924"), you MUST capture that specific text in the `note` field of the date object, and provide your best estimate for the `year`.
5. For the `location` field, populate the location of the event, leave blank if not mentioned



----
{existing_events}
----


CRITICAL: You must only return the structured JSON output. Do not add any commentary, greetings, or explanations before or after the JSON.
"""
109 |
--------------------------------------------------------------------------------
/src/test/test_url_crawler.py:
--------------------------------------------------------------------------------
1 | # tests/url_crawler/test_url_crawler.py
2 |
3 | """Tests for the url_krawler_graph."""
4 |
5 | from unittest.mock import AsyncMock, patch
6 |
7 | import pytest
8 |
9 | # Imports are relative to the src directory (configured in pyproject.toml pythonpath)
10 | from url_crawler.url_krawler_graph import url_crawler_app
11 |
12 |
@pytest.fixture
def sample_input_state() -> dict:
    """Provide a sample input state for the url_crawler_app graph."""
    url = "https://www.britannica.com/biography/Henry-Miller"
    question = "Research the life of Henry Miller"
    return {"url": url, "research_question": question}
20 |
21 |
@pytest.fixture
def mock_scraped_content():
    """Provide mock scraped content for testing."""
    # Condensed, dated biography used as a stand-in for a scraped page.
    content = """
    Henry Miller was an American novelist, short story writer and essayist.
    He was born in Yorkville, NYC on December 26, 1891.
    He moved to Paris in 1930 where he lived for many years.
    He wrote Tropic of Cancer, part of his series of novels about his life.
    He married his first wife Beatrice in 1917.
    He had a daughter named Barbara in 1919.
    He divorced Beatrice in 1924.
    He married his second wife June in 1924.
    He died in Pacific Palisades, California on June 7, 1980.
    """
    return content
36 |
37 |
@pytest.mark.asyncio
async def test_url_crawler_with_mocked_llm(
    sample_input_state: dict,
    mock_scraped_content: str,
):
    """Unit test for the simplified URL crawler graph."""
    # Execute the graph with the network-bound crawl patched out.
    with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl:
        mock_crawl.return_value = mock_scraped_content

        result = await url_crawler_app.ainvoke(sample_input_state)

    # The graph exposes both outputs, and in this simplified graph both
    # carry the scraped text unchanged.
    assert "extracted_events" in result
    assert "raw_scraped_content" in result
    assert result["extracted_events"] == mock_scraped_content
    assert result["raw_scraped_content"] == mock_scraped_content

    # The crawler must have been invoked exactly once with the fixture URL.
    mock_crawl.assert_called_once_with(sample_input_state["url"])
64 |
65 |
@pytest.mark.asyncio
async def test_url_crawler_with_mocked_url_crawling(
    sample_input_state: dict,
    mock_scraped_content: str,
):
    """Test URL crawler with mocked URL crawling."""
    # NOTE(review): this test is functionally identical to
    # test_url_crawler_with_mocked_llm in this file — consider deleting one.
    # --- Act: Execute with mocked URL crawling ---
    with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl:
        # Configure URL crawling mock
        mock_crawl.return_value = mock_scraped_content

        result = await url_crawler_app.ainvoke(sample_input_state)

    # --- Assert: Verify the output ---
    assert "extracted_events" in result
    assert "raw_scraped_content" in result

    extracted_events = result["extracted_events"]
    raw_scraped_content = result["raw_scraped_content"]

    # Verify that the scraped content is returned correctly
    assert extracted_events == mock_scraped_content
    assert raw_scraped_content == mock_scraped_content

    # Verify that url_crawl was called with the correct URL
    mock_crawl.assert_called_once_with(sample_input_state["url"])
92 |
93 |
@pytest.mark.asyncio
async def test_url_crawler_with_empty_content():
    """Test URL crawler with empty scraped content."""
    state = {
        "url": "https://example.com/empty",
        "research_question": "Test question",
    }

    with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl:
        # A page that scrapes to nothing should not break the graph.
        mock_crawl.return_value = ""
        result = await url_crawler_app.ainvoke(state)

    # Empty input flows through as empty strings in both output fields.
    assert "extracted_events" in result
    assert "raw_scraped_content" in result
    assert result["extracted_events"] == ""
    assert result["raw_scraped_content"] == ""
115 |
116 |
@pytest.mark.asyncio
async def test_url_crawler_with_long_content():
    """Test URL crawler with content longer than MAX_CONTENT_LENGTH."""
    state = {
        "url": "https://example.com/long",
        "research_question": "Test question",
    }
    # Build content far beyond any realistic truncation limit.
    long_content = "This is a very long content. " * 10000  # Much longer than limit

    with patch("url_crawler.url_krawler_graph.url_crawl") as mock_crawl:
        mock_crawl.return_value = long_content
        result = await url_crawler_app.ainvoke(state)

    assert "extracted_events" in result
    assert "raw_scraped_content" in result

    # The graph may truncate but never grow the scraped text, and both
    # output fields must stay in sync.
    returned = result["extracted_events"]
    assert len(returned) <= len(long_content)
    assert returned == result["raw_scraped_content"]
--------------------------------------------------------------------------------
/src/test/test_merge_events.py:
--------------------------------------------------------------------------------
1 | # tests/research_events/test_merge_events.py
2 |
3 | """Tests for the merge_events_graph."""
4 |
5 | from unittest.mock import AsyncMock, patch
6 |
7 | import pytest
8 | from src.state import CategoriesWithEvents
9 |
10 | # Imports are relative to the src directory (configured in pyproject.toml pythonpath)
11 | from research_events.merge_events import merge_events_graph
12 | from research_events.merge_events.merge_events_graph import merge_events_app
13 |
14 | # ## Refactoring Note: Import the module containing the object to be patched.
15 | # This makes the patch call cleaner and more robust against refactoring.
16 |
17 |
@pytest.fixture
def sample_input_state() -> dict:
    """Provide a sample input state for the merge_events_app graph."""
    existing = CategoriesWithEvents(
        early="Born in 1920 in Paris.",
        personal="Married in 1945.",
        career="Published first novel in 1950.",
        legacy="Won Nobel Prize in 1980.",
    )
    raw = "Born in 1920 in Paris, France. Started writing poetry at age 15. Moved to London in 1942."
    return {"existing_events": existing, "raw_extracted_events": raw}
30 |
31 |
class MockResponse:
    """Minimal stand-in for an LLM response exposing only `.content`."""

    def __init__(self, content):
        """Wrap the given content value."""
        self.content = content
38 |
39 |
@pytest.fixture
def mock_structured_llm():
    """Provide a reusable mock for model_for_structured with configurable responses."""

    def create_mock_model(categorized_events, merge_responses):
        """Build a mock model wired with the given canned responses."""
        mock_model = AsyncMock()

        # Structured-output path: always yields the pre-categorized events.
        structured = AsyncMock()
        structured.ainvoke.return_value = categorized_events
        mock_model.with_structured_output.return_value = structured

        async def pop_next_response(prompt):
            # Plain ainvoke path: hand out merge responses in FIFO order.
            return merge_responses.pop(0)

        mock_model.ainvoke = pop_next_response

        return mock_model

    return create_mock_model
63 |
64 |
@pytest.mark.skip(reason="Skip mocked LLM test for now")
@pytest.mark.asyncio
async def test_merge_events_with_mocked_llm(
    sample_input_state: dict, mock_structured_llm
):
    """Unit test for the merge events graph with a mocked LLM."""
    # --- Arrange: Mock Data Setup ---
    mock_categorized_events = CategoriesWithEvents(
        early="Born in 1920 in Paris, France. Started writing poetry at age 15.",
        personal="Moved to London in 1942.",
        career="",
        legacy="",
    )

    # One merge response per category, consumed in order (early, personal,
    # career, legacy) — the ordering must match how the graph iterates.
    mock_merge_responses = [
        MockResponse(
            "Born in 1920 in Paris, France. Started writing poetry at age 15."
        ),
        MockResponse("Married in 1945. Moved to London in 1942."),
        MockResponse("Published first novel in 1950."),
        MockResponse("Won Nobel Prize in 1980."),
    ]

    # --- Act: Execute the graph with patched dependencies ---
    # Patch the module attribute so the graph's nodes pick up the mock.
    with patch.object(merge_events_graph, "model_for_structured") as mock_model:
        # Configure the mock using the reusable fixture
        llm_mock = mock_structured_llm(mock_categorized_events, mock_merge_responses)

        # Apply the mock configuration
        mock_model.ainvoke = llm_mock.ainvoke
        mock_model.with_structured_output.return_value = (
            llm_mock.with_structured_output.return_value
        )

        result = await merge_events_app.ainvoke(sample_input_state)

    # --- Assert: Verify the output ---
    assert "existing_events" in result
    merged_events = result["existing_events"]

    assert isinstance(merged_events, CategoriesWithEvents)
    assert (
        merged_events.early
        == "Born in 1920 in Paris, France. Started writing poetry at age 15."
    )
    assert merged_events.personal == "Married in 1945. Moved to London in 1942."
    assert merged_events.career == "Published first novel in 1950."
    assert merged_events.legacy == "Won Nobel Prize in 1980."
114 |
@pytest.mark.skip(reason="Skip real LLM test for now")
@pytest.mark.llm
@pytest.mark.asyncio
async def test_merge_events_with_real_llm(sample_input_state: dict):
    """Integration test for the merge events graph with real LLM calls."""
    # --- Act ---
    result = await merge_events_app.ainvoke(sample_input_state)

    # --- Assert ---
    assert "existing_events" in result
    merged = result["existing_events"]
    assert isinstance(merged, CategoriesWithEvents)

    all_merged_text = " ".join(vars(merged).values())
    # Check that key old and new info is present somewhere
    print("merged", merged)
    for field in ("early", "personal", "career", "legacy"):
        print(f"merged.{field}: {getattr(merged, field)}")
    assert "1920" in merged.early
    assert "Married" in merged.personal
    assert "Nobel Prize" in merged.legacy
    assert "London" in merged.personal
139 |
--------------------------------------------------------------------------------
/src/state.py:
--------------------------------------------------------------------------------
1 | """Defines the Pydantic models and TypedDicts for the research agent graph.
2 | This file serves as the schema for data structures, agent tools, and state management.
3 | """
4 |
5 | import operator
6 | from typing import Annotated, List, TypedDict
7 |
8 | from langchain_core.messages import MessageLikeRepresentation
9 | from pydantic import BaseModel, Field
10 |
11 | ################################################################################
12 | # Section 1: Core Data Models
13 | # - Defines the structure of the primary research output: the chronological timeline.
14 | ################################################################################
15 |
16 |
class ChronologyDate(BaseModel):
    """A structured representation of a date for a chronological event."""

    # Often only a year is known; finer detail (month, day, ranges) is free
    # text in `note`. Field descriptions double as schema text for the LLM.
    year: int | None = Field(None, description="The year of the event.")
    note: str | None = Field(
        None, description="Adds extra information to the date (month, day, range...)."
    )
24 |
25 |
class ChronologyEventInput(BaseModel):
    """Represents a single event, typically used for initial data extraction before an ID is assigned."""

    # Field descriptions feed the structured-output schema shown to the LLM,
    # so they are effectively prompt text.
    name: str = Field(description="A short, title-like name for the event.")
    description: str = Field(description="A concise description of the event.")
    date: ChronologyDate = Field(..., description="The structured date of the event.")
    location: str | None = Field(
        None, description="The geographical location where the event occurred."
    )
35 |
36 |
class ChronologyEvent(ChronologyEventInput):
    """The final, canonical event model with a unique identifier."""

    # Extends ChronologyEventInput with a slug-style identifier.
    id: str = Field(
        description="The id of the event in lowercase and underscores. Ex: 'word1_word2'"
    )
43 |
44 |
class ChronologyInput(BaseModel):
    """A list of newly extracted events from a research source."""

    # Events without IDs yet; they become ChronologyEvent once finalized.
    events: list[ChronologyEventInput]
49 |
50 |
class Chronology(BaseModel):
    """A complete chronological timeline with finalized (ID'd) events."""

    # The canonical timeline: every event carries a unique `id`.
    events: list[ChronologyEvent]
55 |
56 |
class CategoriesWithEvents(BaseModel):
    # Narrative buckets for a person's biography; each holds a free-text block
    # of events. The Field descriptions below are part of the schema shown to
    # the LLM during structured output, so they double as prompt text.
    # NOTE(review): intentionally no class docstring — adding one would change
    # the generated schema description sent to the model; confirm before adding.
    early: str = Field(
        default="",
        description="Covers childhood, upbringing, family, education, and early influences that shaped the author.",
    )
    personal: str = Field(
        default="",
        description="Focuses on relationships, friendships, family life, places of residence, and notable personal traits or beliefs.",
    )
    career: str = Field(
        default="",
        description="Details their professional journey: first steps into writing, major publications, collaborations, recurring themes, style, and significant milestones.",
    )
    legacy: str = Field(
        default="",
        description="Explains how their work was received, awards or recognition, cultural/literary impact, influence on other authors, and how they are remembered today.",
    )
74 |
75 |
76 | ################################################################################
77 | # Section 2: Agent Tools
78 | # - Pydantic models that define the tools available to the LLM agents.
79 | ################################################################################
80 |
81 |
class ResearchEventsTool(BaseModel):
    """The query to be used to research events about an historical figure. The query is based on the reflection of the assistant."""

    # The next research question the supervisor wants answered.
    # (Removed the stale `pass  # No arguments needed` left over from when
    # this tool had no fields — it contradicted the field above.)
    research_question: str
87 |
88 |
class FinishResearchTool(BaseModel):
    """Concludes the research process.
    Call this tool ONLY when you have a comprehensive timeline of the person's life,
    including key events like birth, death, major achievements, and significant personal
    milestones, and you are confident that no major gaps remain.
    """

    # Intentionally argument-free: invoking the tool is the signal itself.
    pass
97 |
98 |
99 | ################################################################################
100 | # Section 3: Graph State Definitions
101 | # - TypedDicts and models that define the "memory" for the agent graphs.
102 | ################################################################################
103 |
104 |
def override_reducer(current_value, new_value):
    """State reducer: append by default, replace on an explicit override.

    A plain update is combined with the existing value via ``+`` (list/string
    concatenation). Passing ``{"type": "override", "value": ...}`` discards
    the current value and keeps ``value`` instead (falling back to the dict
    itself when no ``value`` key is present).
    """
    wants_override = isinstance(new_value, dict) and new_value.get("type") == "override"
    if wants_override:
        return new_value.get("value", new_value)
    return current_value + new_value
110 |
111 |
112 | # --- Main Supervisor Graph State ---
113 |
114 |
class SupervisorStateInput(TypedDict):
    """The initial input to start the main research graph."""

    # NOTE(review): TypedDict classes do not enforce runtime defaults, so the
    # ``Field(...)`` assignments below are effectively annotation-only on
    # plain dict instances — confirm callers always seed these keys (or that
    # the graph tolerates their absence) before relying on the defaults.
    person_to_research: str
    existing_events: CategoriesWithEvents = Field(
        default=CategoriesWithEvents(early="", personal="", career="", legacy=""),
        description="Covers chronology events of the person to research.",
    )
    used_domains: list[str] = Field(
        default=[],
        description="The domains that have been used to extract events.",
    )
    events_summary: str = Field(
        default="",
        description="A summary of the events.",
    )
131 |
132 |
class SupervisorState(SupervisorStateInput):
    """The complete state for the main supervisor graph."""

    # Finalized, ID'd timeline produced at the end of the run.
    final_events: List[ChronologyEvent]
    # Message history; override_reducer lets updates either append (default)
    # or wholesale replace via {"type": "override", "value": ...}.
    conversation_history: Annotated[list[MessageLikeRepresentation], override_reducer]
    # NOTE(review): TypedDicts ignore runtime defaults, so ``= 0`` here is
    # annotation-only — callers must seed iteration_count explicitly.
    iteration_count: int = 0
    structured_events: list[ChronologyEvent] | None
140 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/fullcategorized.json:
--------------------------------------------------------------------------------
1 | {
2 | "early": "- Henry Miller was born: Henry Valentine Miller was born on December 26 in New York City. (1891, New York City)\n- Family moved to Brooklyn: Miller's family moved to 1063 Decatur Street in Brooklyn's Bushwick neighborhood. (1900, Brooklyn, New York City)\n- Henry Miller was active with the Socialist Party of America: Miller was active with the Socialist Party of America. (1900 circa, New York)\n- Attended Eastern District High School: Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school. (early 1900s, Williamsburg, Brooklyn)\n- Attended City College of New York: Miller attended the City College of New York for one semester. (early 1910s, New York City)\n- Henry Miller married Beatrice Sylvas Wickens: Miller married Beatrice Sylvas Wickens, an amateur pianist. (1917, New York)\n- Daughter Barbara born: Miller and Beatrice had a daughter named Barbara. (1919, New York)\n- Worked at Western Union: Miller worked at Western Union as personnel manager in the messenger department. (1920-1924, New York City)\n- Wrote first novel Clipped Wings: Miller wrote his first novel, 'Clipped Wings,' during a three-week vacation in March (unpublished, only fragments remain). (1922 March, New York City)\n- Divorced Beatrice Sylvas Wickens: Miller was divorced from Beatrice Sylvas Wickens. (1923 December 21, New York)\n- Met June Mansfield: Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall. (1923, New York)\n- Married June Mansfield: Miller married June Mansfield. (1924 June 1, New York)\n- Quit Western Union to write: Miller quit his job at Western Union to dedicate himself completely to writing. (1924, New York)",
3 | "personal": "- Spent months in Paris with June: Miller spent several months in Paris with June, trip financed by Roland Freedman. (1928, Paris, France)\n- Moved to Paris alone: Miller moved to Paris unaccompanied. (1930, Paris, France)\n- Divorced by June in Mexico: June divorced Miller by proxy. (1934, Mexico City, Mexico)\n- Visited Greece: Miller visited Greece, invited by Lawrence Durrell who was living in Corfu. (1939, Greece)\n- Returned to New York: Miller returned to New York. (1940, New York City)\n- Moved to California: Miller moved to California in June, initially residing just outside Hollywood in Beverly Glen. (1942 June, California)\n- Settled in Big Sur: Miller settled in Big Sur. (1944, Big Sur, California)\n- Married Janina Martha Lepska: Miller married Janina Martha Lepska. (1944, United States)\n- Lived in Big Sur with bohemian writers: Miller continued living in Big Sur with other bohemian writers. (from 1947, Big Sur, California)\n- Divorced Janina Martha Lepska: Miller was divorced from Janina Martha Lepska. (1952, United States)\n- Married artist Eve McClure: Miller married artist Eve McClure. (1953, United States)\n- Divorced Eve McClure: Miller was divorced from Eve McClure. (1960, United States)\n- Reunion with June in New York: Miller arranged a reunion with ex-wife June in New York. (1961, New York)\n- Moved to Pacific Palisades: Miller moved to 444 Ocampo Drive, Pacific Palisades, Los Angeles. (1963, Pacific Palisades, Los Angeles, California)\n- Married Hiroko Tokuda: Miller married Hiroko Tokuda. (1967, United States)\n- Divorced Hiroko Tokuda: Miller was divorced from Hiroko Tokuda. (1977, United States)\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)",
4 | "career": "- Wrote Moloch: Miller wrote 'Moloch: or, This Gentile World,' initially under the guise of a novel by June Mansfield (unpublished until 1992). (1927-1928, New York)\n- Proofreader for Chicago Tribune Paris: Miller was employed by the Chicago Tribune Paris edition as a proofreader. (1931, Paris, France)\n- Published Tropic of Cancer: Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States). (1934, Paris, France)\n- Published Black Spring: Miller published 'Black Spring' (banned in the United States). (1936, Paris, France)\n- Published Tropic of Capricorn: Miller published 'Tropic of Capricorn' (banned in the United States). (1939, Paris, France)\n- Published The Colossus of Maroussi: Miller described his visit to Greece in 'The Colossus of Maroussi'. (1941, United States)\n- Began writing Sexus: Miller began writing 'Sexus,' the first novel in 'The Rosy Crucifixion' trilogy. (1942, California)\n- Published Sunday After the War: Miller published 'Sunday After the War'. (1944, United States)\n- Published The Air-Conditioned Nightmare: Miller published 'The Air-Conditioned Nightmare'. (1945, United States)\n- Published Big Sur and the Oranges of Hieronymus Bosch: Miller published 'Big Sur and the Oranges of Hieronymus Bosch'. (1957, United States)\n- Completed The Rosy Crucifixion trilogy: Miller completed 'The Rosy Crucifixion' trilogy (initially banned in the U.S., published in France and Japan). (1959, United States)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)\n- Published On Turning Eighty: Miller published 'On Turning Eighty,' a chapbook with 200 copies. (1972, United States)\n- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )",
5 | "legacy": "- Published The Colossus of Maroussi: Miller described his visit to Greece in 'The Colossus of Maroussi'. (1941, United States)\n- Published Big Sur and the Oranges of Hieronymus Bosch: Miller published 'Big Sur and the Oranges of Hieronymus Bosch'. (1957, United States)\n- Tropic of Cancer published in the US: 'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials. (1961, United States)\n- Nominated for Nobel Prize: Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip. (1973, )\n- Henry Miller died: Henry Miller died of circulatory complications at home. (1980 June 7, Pacific Palisades, Los Angeles)"
6 | }
7 |
--------------------------------------------------------------------------------
/src/research_events/result.json:
--------------------------------------------------------------------------------
1 | {
2 | "existing_events": {
3 | "early": "- Henry Valentine Miller was born in New York City on December 26, 1891.\n- He lived at 450 East 85th Street in Manhattan during his early years.\n- His family moved to Williamsburg, Brooklyn when he was around nine years old, and later to Bushwick.\n- Miller attended Eastern District High School in Williamsburg.\n- He briefly studied at the City College of New York.\n- Miller became active with the Socialist Party of America.\n- He admired Hubert Harrison.",
4 | "personal": "- Miller married Beatrice Sylvas Wickens in 1917 and divorced her in 1923. They had a daughter, Barbara.\n- Miller met June Mansfield around 1924 and they married on June 1, 1924.\n- Miller lived with Kronski at some point between 1926-1927.\n- Miller moved to Paris in 1930. He spent several months there with June in 1938. During his ten-year stay in Paris, Miller became fluent in French.\n- Miller returned to New York in 1940 and moved to California in 1942, initially residing just outside Hollywood in Beverly Glen before settling in Big Sur in 1944.\n- Miller married Janina Martha Lepska in 1944 and had two children with her. They divorced in 1952.\n- Miller married Eve McClure in 1953 but they divorced in 1960.\n- Miller married Hiroko Tokuda in 1967 but they divorced in 1977.",
5 | "career": "- Miller quit Western Union to dedicate himself to writing in 1924.\n- He was supported financially by Roland Freedman who paid June Mansfield to write a novel, pretending it was her work and reviewing Miller's writing weekly.\n- Miller moved to Paris unaccompanied in 1930.\n- He was employed as a proofreader for the Chicago Tribune Paris edition in 1931 thanks to Alfred Perlès.\n- This period marked a creative time for Miller, and he began building a network of authors around Villa Seurat.\n- Lawrence Durrell became a lifelong friend.\n- Anaïs Nin and Hugh Guiler financially supported Miller between 1931-1934, covering his living expenses including rent at 18 Villa Seurat.\n- Nin became his lover and financed the first printing of Tropic of Cancer in 1934 with money from Otto Rank.",
6 | "legacy": "- Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip in 1973.\n- Miller participated in the filming of Reds in the late 1970s.\n- Miller held an ongoing correspondence of over 1,500 letters with Brenda Venus between 1978 and 1981.\n- Miller died on June 7, 1980 at his home in Pacific Palisades, Los Angeles, aged 88.\n- The Henry Miller Memorial Library was founded in Big Sur in 1981 by Emil White."
7 | },
8 | "final_events": {
9 | "early": "- Henry Valentine Miller was born in New York City on December 26, 1891.\n- He lived at 450 East 85th Street in Manhattan during his early years.\n- His family moved to Williamsburg, Brooklyn when he was around nine years old, and later to Bushwick.\n- Miller attended Eastern District High School in Williamsburg.\n- He briefly studied at the City College of New York.\n- Miller became active with the Socialist Party of America.\n- He admired Hubert Harrison.\n- He was brought up in Brooklyn.",
10 | "personal": "- Miller married Beatrice Sylvas Wickens in 1917 and divorced her in 1923. They had a daughter, Barbara.\n- Miller met June Mansfield around 1924 and they married on June 1, 1924.\n- Miller left his job with Western Union in New York to devote himself to writing in 1924.\n- Miller lived with Kronski at some point between 1926-1927.\n- In 1930, Miller moved to Paris and visited Greece in 1939. During his ten-year stay in France, he became fluent in French.\n- Miller toured the United States extensively between 1940-41 before settling in Big Sur, California.\n- Miller returned to New York in 1940 and moved to California in 1942, initially residing just outside Hollywood in Beverly Glen before settling in Big Sur in 1944.\n- Miller married Janina Martha Lepska in 1944 and had two children with her. They divorced in 1952.\n- Miller married Eve McClure in 1953 but they divorced in 1960.\n- Miller married Hiroko Tokuda in 1967 but they divorced in 1977.",
11 | "career": "- Miller quit Western Union to dedicate himself to writing in 1924.\n- He was supported financially by Roland Freedman who paid June Mansfield to write a novel, pretending it was her work and reviewing Miller's writing weekly.\n- Miller moved to Paris unaccompanied in 1930.\n- He was employed as a proofreader for the Chicago Tribune Paris edition in 1931 thanks to Alfred Perlès.\n- This period marked a creative time for Miller, and he began building a network of authors around Villa Seurat.\n- Lawrence Durrell became a lifelong friend.\n- Anaïs Nin and Hugh Guiler financially supported Miller between 1931-1934, covering his living expenses including rent at 18 Villa Seurat.\n- Nin became his lover and financed the first printing of Tropic of Cancer in 1934 with money from Otto Rank.\n- Miller's career as a writer began when he left his job at Western Union to focus on writing full-time in 1924.\n- He spent significant time abroad for his work, first in France (1930), later Greece (1939).\n- His travels also included an extensive tour of the United States between 1940-41.",
12 | "legacy": "- Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip in 1973.\n- Miller participated in the filming of Reds in the late 1970s.\n- Miller held an ongoing correspondence of over 1,500 letters with Brenda Venus between 1978 and 1981.\n- Miller died on June 7, 1980 at his home in Pacific Palisades, Los Angeles, aged 88. He is remembered for his significant contributions to literature and his influence on other authors.\n- The Henry Miller Memorial Library was founded in Big Sur in 1981 by Emil White."
13 | },
14 | "raw_extracted_events": "Here's a consolidated and chronological list of biographical events for Henry Miller, based on the provided text:\n\n* **December 26, 1891:** Henry Miller was born in New York City.\n* **Childhood:** Miller was brought up in Brooklyn.\n* **1924:** Miller left his job with Western Union in New York to devote himself to writing.\n* **1930:** Miller went to France.\n* **1939:** Miller visited Greece.\n* **1940–41:** Miller toured the United States extensively.\n* **Later Years:** Miller settled in Big Sur, California, becoming the center of a colony of admirers.\n* **June 7, 1980:** Henry Miller died in Pacific Palisades, California, at the age of 88."
15 | }
16 |
--------------------------------------------------------------------------------
/src/test/test_research_events.py:
--------------------------------------------------------------------------------
1 | # tests/research_events/test_research_events.py
2 |
3 | """Tests for the research_events_graph."""
4 |
5 | from unittest.mock import AsyncMock, patch
6 |
7 | import pytest
8 | from src.state import CategoriesWithEvents
9 |
10 | # Imports are relative to the src directory (configured in pyproject.toml pythonpath)
11 | from research_events.research_events_graph import research_events_app
12 |
13 |
@pytest.fixture
def sample_input_state() -> dict:
    """Build the minimal input state expected by the research_events_app graph."""
    seeded_events = CategoriesWithEvents(
        early="Born in 1920 in Paris.",
        personal="Married in 1945.",
        career="Published first novel in 1950.",
        legacy="Won Nobel Prize in 1980.",
    )
    return {
        "research_question": "Research the life of Henry Miller",
        "existing_events": seeded_events,
        "used_domains": [],
    }
27 |
28 |
class MockResponse:
    """Lightweight stand-in for an LLM response exposing only ``content``."""

    def __init__(self, content):
        """Store the response payload verbatim."""
        self.content = content
35 |
36 |
class MockToolCall:
    """Minimal imitation of a tool call: a name plus its argument mapping."""

    def __init__(self, name, args):
        """Record the tool name and arguments without validation."""
        self.name = name
        self.args = args
44 |
45 |
class MockToolResponse:
    """Minimal imitation of a model message exposing ``tool_calls``."""

    def __init__(self, tool_calls=None):
        """Normalize any falsy value (None, empty list) to a fresh empty list."""
        self.tool_calls = [] if not tool_calls else tool_calls
52 |
53 |
@pytest.fixture
def mock_url_crawler():
    """Factory fixture producing url_crawler_app doubles with canned output."""

    def create_mock_crawler(extracted_events):
        """Return an AsyncMock whose ainvoke yields the given events."""
        crawler = AsyncMock()
        crawler.ainvoke.return_value = {
            "extracted_events": extracted_events,
            "raw_scraped_content": "Mock scraped content",
        }
        return crawler

    return create_mock_crawler
68 |
69 |
@pytest.fixture
def mock_merge_events():
    """Factory fixture producing merge_events_app doubles with canned output."""

    def create_mock_merger(existing_events):
        """Return an AsyncMock whose ainvoke yields the given merged events."""
        merger = AsyncMock()
        merger.ainvoke.return_value = {"existing_events": existing_events}
        return merger

    return create_mock_merger
81 |
82 |
# @pytest.mark.skip(reason="Skip mocked LLM test for now")
@pytest.mark.asyncio
async def test_research_events_with_mocked_llm(
    sample_input_state: dict, mock_url_crawler, mock_merge_events
):
    """Unit test for the research events graph with mocked dependencies.

    Patches the crawler/merger subgraphs, the Tavily search tool, and the
    structured-LLM factory so the graph runs without network or model calls.
    """
    from unittest.mock import Mock

    # --- Arrange: Mock Data Setup ---
    mock_extracted_events = "Born in 1920 in Paris, France. Started writing poetry at age 15. Moved to London in 1942."
    mock_existing_events = CategoriesWithEvents(
        early="Born in 1920 in Paris, France. Started writing poetry at age 15.",
        personal="Married in 1945. Moved to London in 1942.",
        career="Published first novel in 1950.",
        legacy="Won Nobel Prize in 1980.",
    )

    # --- Act: Execute the graph with patched dependencies ---
    with (
        patch(
            "research_events.research_events_graph.url_crawler_app"
        ) as mock_crawler_patch,
        patch(
            "research_events.research_events_graph.merge_events_app"
        ) as mock_merger_patch,
        patch("research_events.research_events_graph.TavilySearch") as mock_tavily,
        # BUG FIX: the module attribute is `create_llm_structured_model`; the
        # previous target "create_structured_model" does not exist, which made
        # patch() raise AttributeError before the test body ever ran.
        patch(
            "research_events.research_events_graph.create_llm_structured_model"
        ) as mock_llm,
    ):
        # Configure the subgraph mocks
        mock_crawler_patch.ainvoke = mock_url_crawler(mock_extracted_events).ainvoke
        mock_merger_patch.ainvoke = mock_merge_events(mock_existing_events).ainvoke

        # Mock TavilySearch to return empty results (no URLs found)
        mock_tavily_instance = Mock()
        mock_tavily_instance.invoke.return_value = {"results": []}
        mock_tavily.return_value = mock_tavily_instance

        # Mock the structured LLM to return a test URL
        mock_llm_instance = Mock()
        mock_llm_instance.invoke.return_value = Mock(
            selected_urls=["https://example.com/test"]
        )
        mock_llm.return_value = mock_llm_instance

        result = await research_events_app.ainvoke(sample_input_state)

    # --- Assert: Verify the output ---
    # The result structure should have existing_events and used_domains
    assert "existing_events" in result
    assert "used_domains" in result

    existing_events = result["existing_events"]
    used_domains = result["used_domains"]

    assert isinstance(existing_events, CategoriesWithEvents)
    assert (
        existing_events.early
        == "Born in 1920 in Paris, France. Started writing poetry at age 15."
    )
    assert existing_events.personal == "Married in 1945. Moved to London in 1942."
    assert existing_events.career == "Published first novel in 1950."
    assert existing_events.legacy == "Won Nobel Prize in 1980."

    # Verify that domains were tracked
    assert isinstance(used_domains, list)
146 |
147 |
# @pytest.mark.skip(reason="Skip real LLM test for now")
@pytest.mark.llm
@pytest.mark.asyncio
async def test_research_events_with_real_llm(sample_input_state: dict):
    """Integration test for the research events graph with real LLM calls."""
    # --- Act ---
    result = await research_events_app.ainvoke(sample_input_state)

    # --- Assert ---
    # Output contract: merged events plus the domains consumed along the way.
    assert "existing_events" in result
    assert "used_domains" in result

    events = result["existing_events"]
    domains = result["used_domains"]
    assert isinstance(events, CategoriesWithEvents)

    # Flatten all category texts so we can probe for known facts anywhere.
    combined = " ".join(vars(events).values())

    # Something must have been extracted/merged...
    assert len(combined) > 0
    # ...and at least one seeded fact should survive the merge.
    assert "1920" in combined or "Married" in combined or "Nobel Prize" in combined

    # Domains visited during research are tracked as a list.
    assert isinstance(domains, list)
179 |
--------------------------------------------------------------------------------
/src/research_events/research_events_graph.py:
--------------------------------------------------------------------------------
1 | from typing import Literal, TypedDict
2 |
3 | from langchain_tavily import TavilySearch
4 | from langgraph.graph import END, START, StateGraph
5 | from langgraph.graph.state import RunnableConfig
6 | from langgraph.types import Command
7 | from pydantic import BaseModel, Field
8 | from src.configuration import Configuration
9 | from src.llm_service import create_llm_structured_model
10 | from src.research_events.merge_events.merge_events_graph import merge_events_app
11 | from src.services.url_service import URLService
12 | from src.state import CategoriesWithEvents
13 | from src.url_crawler.url_krawler_graph import url_crawler_app
14 | from src.utils import get_langfuse_handler
15 |
16 |
class InputResearchEventsState(TypedDict):
    """Input schema: what callers must provide to start this subgraph."""

    research_question: str
    existing_events: CategoriesWithEvents
    used_domains: list[str]
21 |
22 |
class ResearchEventsState(InputResearchEventsState):
    """Full working state: the input keys plus internal scratch fields."""

    # Candidate URLs still pending; the head of the list is processed next.
    urls: list[str]
    # Scratch field: events extracted from the most recently crawled URL,
    # consumed by the merge step.
    extracted_events: str
27 |
28 |
class OutputResearchEventsState(TypedDict):
    """Output schema: the merged events and the domains consumed."""

    existing_events: CategoriesWithEvents
    used_domains: list[str]
32 |
33 |
class BestUrls(BaseModel):
    # Structured-output target for URL selection; the Field description is part
    # of the schema the LLM sees, so it doubles as prompt text.
    selected_urls: list[str] = Field(description="A list of the two best URLs.")
36 |
37 |
def url_finder(
    state: ResearchEventsState,
    config: RunnableConfig,
) -> Command[Literal["should_process_url_router"]]:
    """Find candidate URLs for the research question.

    Queries Tavily (excluding domains already used), then asks a structured
    LLM to pick the two URLs most likely to contain biographical events.

    Args:
        state: Current subgraph state; must contain ``research_question``.
        config: Runnable config, forwarded to the structured-model factory.

    Returns:
        A Command routing to ``should_process_url_router`` with the selected
        URLs placed in state.

    Raises:
        ValueError: If ``research_question`` is missing from the state.
    """
    research_question = state.get("research_question", "")
    used_domains = state.get("used_domains", [])

    if not research_question:
        raise ValueError("research_question is required")

    tool = TavilySearch(
        max_results=6,
        topic="general",
        include_raw_content=False,
        include_answer=False,
        exclude_domains=used_domains,
    )

    search_response = tool.invoke({"query": research_question})

    # Fix: the original comprehension shadowed the outer `result` variable;
    # iterate the entries under their own name instead.
    urls = [entry["url"] for entry in search_response["results"]]

    prompt = """
From the results below, select the two URLs that will provide the most bibliographical events
(key life events, publications, historical records, detailed timelines) about
the subject's life in relation to the research question.


{results}



{research_question}


"""

    prompt = prompt.format(results=urls, research_question=research_question)

    structured_llm = create_llm_structured_model(config=config, class_name=BestUrls)

    structured_result = structured_llm.invoke(prompt)

    return Command(
        goto="should_process_url_router",
        update={"urls": structured_result.selected_urls},
    )
103 |
104 |
def updateUrlList(
    state: ResearchEventsState,
) -> tuple[list[str], list[str]]:
    """Delegate to URLService: drop the processed URL and record its domain.

    Returns the (remaining_urls, updated_used_domains) pair.
    """
    # NOTE(review): camelCase name kept — it is this module's public interface
    # and is called by merge_events_and_update; PEP 8 would prefer snake_case.
    pending_urls = state.get("urls", [])
    seen_domains = state.get("used_domains", [])
    return URLService.update_url_list(pending_urls, seen_domains)
112 |
113 |
def should_process_url_router(
    state: ResearchEventsState,
) -> Command[Literal["crawl_url", "__end__"]]:
    """Route to crawling while URLs remain, skipping already-used domains.

    Ends the graph when the URL queue is empty; otherwise either discards the
    head URL (domain already consumed) and re-enters this router, or proceeds
    to crawl it.
    """
    pending = state.get("urls", [])
    seen_domains = state.get("used_domains", [])

    # Guard clause: nothing left to process.
    if not pending:
        print("No URLs remaining. Routing to __end__.")
        return Command(
            goto=END,
        )

    # Skip the head URL when its domain was already consumed, then re-route.
    if URLService.extract_domain(pending[0]) in seen_domains:
        return Command(
            goto="should_process_url_router",
            update={"urls": pending[1:], "used_domains": seen_domains},
        )

    print(f"URLs remaining: {len(state['urls'])}. Routing to crawl.")
    return Command(goto="crawl_url")
138 |
139 |
async def crawl_url(
    state: ResearchEventsState,
) -> Command[Literal["merge_events_and_update"]]:
    """Crawl the head of the URL queue and stash the extracted events.

    Raises:
        ValueError: If ``research_question`` is missing from the state.
    """
    research_question = state.get("research_question", "")
    if not research_question:
        raise ValueError("research_question is required for url crawling")

    # The router guarantees at least one URL here; process the head.
    target_url = state["urls"][0]

    # Delegate the actual scraping/extraction to the crawler subgraph.
    crawl_result = await url_crawler_app.ainvoke(
        {"url": target_url, "research_question": research_question}
    )

    # Hand the extracted events to the merge node via temporary state.
    return Command(
        goto="merge_events_and_update",
        update={"extracted_events": crawl_result["extracted_events"]},
    )
161 |
162 |
async def merge_events_and_update(
    state: ResearchEventsState,
) -> Command[Literal["should_process_url_router"]]:
    """Fold freshly extracted events into the timeline, then re-enter the router."""
    merge_payload = {
        "existing_events": state.get("existing_events", CategoriesWithEvents()),
        "extracted_events": state.get("extracted_events", ""),
        "research_question": state.get("research_question", ""),
    }

    # Delegate the merge to the dedicated subgraph.
    merge_result = await merge_events_app.ainvoke(merge_payload)

    # Advance the queue: drop the URL just processed and remember its domain.
    remaining_urls, used_domains = updateUrlList(state)

    return Command(
        goto="should_process_url_router",
        update={
            "existing_events": merge_result["existing_events"],
            "urls": remaining_urls,
            "used_domains": used_domains,
        },
    )
192 |
193 |
# Assemble the research-events sub-graph. Routing between nodes is done
# via Command(goto=...) inside each node, so only the entry edge is declared.
research_events_builder = StateGraph(
    ResearchEventsState,
    input_schema=InputResearchEventsState,
    output_schema=OutputResearchEventsState,
    config_schema=Configuration,
)

# Add all the nodes to the graph
research_events_builder.add_node("url_finder", url_finder)
research_events_builder.add_node("should_process_url_router", should_process_url_router)
research_events_builder.add_node("crawl_url", crawl_url)
research_events_builder.add_node("merge_events_and_update", merge_events_and_update)

# Set the entry point
research_events_builder.add_edge(START, "url_finder")


# Compile with Langfuse tracing callbacks attached.
research_events_app = research_events_builder.compile().with_config(
    {"callbacks": [get_langfuse_handler()]}
)
214 |
--------------------------------------------------------------------------------
/src/graph.py:
--------------------------------------------------------------------------------
1 | from typing import Literal
2 |
3 | from langchain_core.messages import (
4 | HumanMessage,
5 | SystemMessage,
6 | ToolMessage,
7 | )
8 | from langchain_core.runnables import RunnableConfig
9 | from langgraph.graph import START, StateGraph
10 | from langgraph.types import Command
11 | from src.configuration import Configuration
12 | from src.llm_service import (
13 | create_llm_structured_model,
14 | create_llm_with_tools,
15 | )
16 | from src.prompts import (
17 | events_summarizer_prompt,
18 | lead_researcher_prompt,
19 | structure_events_prompt,
20 | )
21 | from src.research_events.research_events_graph import research_events_app
22 | from src.state import (
23 | CategoriesWithEvents,
24 | Chronology,
25 | FinishResearchTool,
26 | ResearchEventsTool,
27 | SupervisorState,
28 | SupervisorStateInput,
29 | )
30 | from src.utils import get_buffer_string_with_tools, get_langfuse_handler, think_tool
31 |
# Module-level configuration; the iteration cap bounds the supervisor <-> tools loop.
config = Configuration()
MAX_TOOL_CALL_ITERATIONS = config.max_tool_iterations


# Verify connection (kept for manual debugging of Langfuse credentials)
# if langfuse.auth_check():
#     print("Langfuse client is authenticated and ready!")
# else:
#     print("Authentication failed. Please check your credentials and host.")
41 |
42 |
async def supervisor_node(
    state: SupervisorState,
    config: RunnableConfig,
) -> Command[Literal["supervisor_tools"]]:
    """The 'brain' of the agent. It decides the next action.

    Builds a system prompt from the current research progress, lets the
    LLM pick one of the available tools, and routes to the tool executor.

    Args:
        state: Current supervisor state (history, events summary, person).
        config: Runtime configuration with model settings.

    Returns:
        Command routing to "supervisor_tools" with the LLM response
        appended to the conversation history and the iteration counter
        incremented.
    """
    tools = [
        ResearchEventsTool,
        FinishResearchTool,
        think_tool,
    ]

    tools_model = create_llm_with_tools(tools=tools, config=config)

    # Bug fix: the history is a list of messages, so default to [] (not "")
    # and guard on the messages themselves before indexing the last one.
    # (Previously the guard checked the length of a rendered summary string
    # while indexing the raw message list.)
    messages = state.get("conversation_history", [])
    last_message = messages[-1] if messages else ""

    system_message = SystemMessage(
        content=lead_researcher_prompt.format(
            person_to_research=state["person_to_research"],
            events_summary=state.get("events_summary", "Everything is missing"),
            last_message=last_message,
            max_iterations=5,
        )
    )

    human_message = HumanMessage(content="Start the research process.")
    prompt = [system_message, human_message]

    response = await tools_model.ainvoke(prompt)

    # The output is an AIMessage with tool_calls, which we add to the history
    return Command(
        goto="supervisor_tools",
        update={
            "conversation_history": [response],
            "iteration_count": state.get("iteration_count", 0) + 1,
        },
    )
82 |
83 |
async def supervisor_tools_node(
    state: SupervisorState,
    config: RunnableConfig,
) -> Command[Literal["supervisor", "structure_events"]]:
    """The 'hands' of the agent. Executes tools and returns a Command for routing.

    Handles three tools: FinishResearchTool (stops researching),
    think_tool (records a reflection into the history), and
    ResearchEventsTool (runs the research sub-graph and refreshes the
    events summary).

    Args:
        state: Current supervisor state with the latest AI message.
        config: Runtime configuration with model settings.

    Returns:
        Command routing back to "supervisor" with updated state, or to
        "structure_events" when research is finished or the iteration
        budget is exhausted.
    """
    existing_events = state.get(
        "existing_events",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )
    events_summary = state.get("events_summary", "")
    used_domains = state.get("used_domains", [])
    last_message = state["conversation_history"][-1]
    iteration_count = state.get("iteration_count", 0)
    exceeded_allowed_iterations = iteration_count >= MAX_TOOL_CALL_ITERATIONS

    # If the LLM made no tool calls, we finish.
    if not last_message.tool_calls or exceeded_allowed_iterations:
        return Command(goto="structure_events")

    # This is the core logic for executing tools and updating state.
    all_tool_messages = []

    for tool_call in last_message.tool_calls:
        tool_name = tool_call["name"]
        tool_args = tool_call["args"]

        if tool_name == "FinishResearchTool":
            return Command(goto="structure_events")

        elif tool_name == "think_tool":
            # The 'think' tool is special: it just records a reflection.
            # The reflection will be in the message history for the *next* supervisor turn.
            response_content = tool_args["reflection"]
            all_tool_messages.append(
                ToolMessage(
                    content=response_content,
                    tool_call_id=tool_call["id"],
                    name=tool_name,
                )
            )

        elif tool_name == "ResearchEventsTool":
            research_question = tool_args["research_question"]
            result = await research_events_app.ainvoke(
                {
                    "research_question": research_question,
                    "existing_events": existing_events,
                    "used_domains": used_domains,
                }
            )
            existing_events = result["existing_events"]
            used_domains = result["used_domains"]

            # Refresh the human-readable summary the supervisor reasons over.
            summarizer_prompt = events_summarizer_prompt.format(
                existing_events=existing_events
            )
            response = await create_llm_structured_model(config=config).ainvoke(
                summarizer_prompt
            )

            # (Removed a redundant `existing_events = existing_events`
            # self-assignment that previously sat here.)
            events_summary = response.content
            all_tool_messages.append(
                ToolMessage(
                    content="Called ResearchEventsTool and returned multiple events",
                    tool_call_id=tool_call["id"],
                    name=tool_name,
                )
            )

    # The Command helper tells the graph where to go next and what state to update.
    return Command(
        goto="supervisor",
        update={
            "existing_events": existing_events,
            "conversation_history": all_tool_messages,
            "used_domains": used_domains,
            "events_summary": events_summary,
        },
    )
164 |
165 |
async def structure_events(
    state: SupervisorState, config: RunnableConfig
) -> Command[Literal["__end__"]]:
    """Step 2: Structures the cleaned events into JSON format.

    Each of the four life categories is structured independently and the
    resulting event lists are concatenated in a fixed order.

    Args:
        state: Current researcher state with cleaned events text.
        config: Runtime configuration with model settings.

    Returns:
        Dictionary with the list of structured chronology events under
        the "structured_events" key.
    """
    print("--- Step 2: Structuring Events into JSON ---")

    # Get the cleaned events from the previous step
    existing_events = state.get("existing_events", "")

    if not existing_events:
        print("Warning: No cleaned events text found in state")
        # Bug fix: the normal path below returns under "structured_events";
        # the empty result must use the same key (was "chronology").
        return {"structured_events": []}

    structured_llm = create_llm_structured_model(config=config, class_name=Chronology)

    # Structure each category and concatenate the events, preserving the
    # original early -> career -> personal -> legacy order.
    all_events = []
    for category in ("early", "career", "personal", "legacy"):
        prompt = structure_events_prompt.format(
            existing_events=existing_events[category]
        )
        response = await structured_llm.ainvoke(prompt)
        all_events += response.events

    return {
        "structured_events": all_events,
    }
218 |
219 |
# Assemble the top-level supervisor graph.
workflow = StateGraph(SupervisorState, input_schema=SupervisorStateInput)

# Add the supervisor loop nodes plus the final structuring step
workflow.add_node("supervisor", supervisor_node)
workflow.add_node("supervisor_tools", supervisor_tools_node)
workflow.add_node("structure_events", structure_events)

# Routing between nodes is driven by Command(goto=...) inside the nodes,
# so only the entry edge is declared explicitly.
workflow.add_edge(START, "supervisor")

# Compile with Langfuse tracing callbacks attached.
graph = workflow.compile().with_config({"callbacks": [get_langfuse_handler()]})
230 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Contributors][contributors-shield]][contributors-url]
2 | [![Forks][forks-shield]][forks-url]
3 | [![Stargazers][stars-shield]][stars-url]
4 | [![Issues][issues-shield]][issues-url]
5 | [![Unlicense License][license-shield]][license-url]
6 | [![LinkedIn][linkedin-shield]][linkedin-url]
7 |
8 | # Event Deep Research
9 |
AI Agent that researches the lives of historical figures and extracts the events into a structured JSON timeline.
11 |
12 |
13 |
14 | ## Table of Contents
15 |
16 | - [Event Deep Research](#event-deep-research)
17 | - [Table of Contents](#table-of-contents)
18 | - [Features](#features)
19 | - [Demo / Example](#demo--example)
20 | - [🚀 Installation](#-installation)
21 | - [Prerequisites](#prerequisites)
22 | - [Setup](#setup)
23 | - [Usage](#usage)
24 | - [Via LangGraph Studio (Recommended)](#via-langgraph-studio-recommended)
25 | - [Configuration (configuration.py)](#configuration-configurationpy)
26 | - [Architecture / Internals](#architecture--internals)
27 | - [Roadmap / Future Work](#roadmap--future-work)
28 | - [Contributing](#contributing)
29 | - [License](#license)
30 | - [Acknowledgments](#acknowledgments)
31 |
32 | ---
33 |
34 | ## Features
35 |
36 | - Supervisor Agent with multiple tools (Research, think, Finish)
37 | - Merge Workflow to incorporate and deduplicate events from multiple sources
38 | - Support for OpenAI, Anthropic, Google, or Local models (Ollama)
39 |
40 | ## Demo / Example
41 |
42 | https://github.com/user-attachments/assets/ebda1625-fdf6-4f3b-a5d2-319d6db40ec2
43 |
44 | **Input:**
45 |
46 | ```json
47 | {
48 | "person_to_research": "Albert Einstein"
49 | }
50 | ```
51 |
52 | **Output:**
53 |
54 | ```json
55 | {
56 | "structured_events": [
57 | {
58 | "name": "Birth in Ulm",
59 | "description": "Albert Einstein was born in Ulm, Germany to Hermann and Pauline Einstein",
60 | "date": {"year": 1879, "note": "March 14"},
61 | "location": "Ulm, German Empire",
62 | "id": "time-1879-03-14T00:00:00Z"
63 | },
64 | {
65 | "name": "Zurich Polytechnic",
66 | "description": "Entered the Swiss Federal Polytechnic School in Zurich to study physics and mathematics",
67 | "date": {"year": 1896, "note": ""},
68 | "location": "Zurich, Switzerland",
69 | "id": "time-1896-01-01T00:00:00Z"
70 | },
71 | {
72 | "name": "Miracle Year Papers",
73 | "description": "Published four groundbreaking papers on photoelectric effect, Brownian motion, special relativity, and mass-energy equivalence",
74 | "date": {"year": 1905, "note": ""},
75 | "location": "Bern, Switzerland",
76 | "id": "time-1905-01-01T00:00:00Z"
77 | },
78 | {
79 | "name": "Nobel Prize in Physics",
80 | "description": "Awarded Nobel Prize for his discovery of the law of the photoelectric effect",
81 | "date": {"year": 1921, "note": ""},
82 | "location": "Stockholm, Sweden",
83 | "id": "time-1921-01-01T00:00:00Z"
84 | },
85 | {
86 | "name": "Death in Princeton",
87 | "description": "Albert Einstein died at Princeton Hospital after refusing surgery for an abdominal aortic aneurysm",
88 | "date": {"year": 1955, "note": "April 18"},
89 | "location": "Princeton, New Jersey, USA",
90 | "id": "time-1955-04-18T00:00:00Z"
91 | }
92 | ]
93 | }
94 | ```
95 |
96 | ## 🚀 Installation
97 |
98 | ### Prerequisites
99 |
100 | - **Python 3.12+**
101 | - **uv** (Python package manager)
102 |
103 | ### Setup
104 |
105 | ```bash
106 | # 1. Clone the repository
107 | git clone https://github.com/bernatsampera/event-deep-research.git
108 | cd event-deep-research
109 |
110 | # 2. Create virtual environment and install dependencies
111 | uv venv && source .venv/bin/activate
112 | uv sync
113 |
114 | # 3. Set up environment variables
115 | cp .env.example .env
116 | # Edit .env with your API keys:
117 | # FIRECRAWL_BASE_URL (https://api.firecrawl.com/v1)
118 | # - FIRECRAWL_API_KEY (required for production, optional for local testing)
119 | # - TAVILY_API_KEY (required)
120 | # - OPENAI_API_KEY, ANTHROPIC_API_KEY, or GOOGLE_API_KEY (Change model in configuration.py)
121 |
122 | # 4. Start the development server
123 | uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking
124 | # Open http://localhost:2024 to access LangGraph Studio
125 | ```
126 |
127 | ## Usage
128 |
129 | ### Via LangGraph Studio (Recommended)
130 |
131 | 1. Start the development server: `uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.12 langgraph dev --allow-blocking`
132 | 2. Open http://localhost:2024
133 | 3. Select the `supervisor` graph
134 | 4. Input your research query:
135 | ```json
136 | {
137 | "person_to_research": "Albert Einstein"
138 | }
139 | ```
140 | 5. Watch the agent work in real-time!
141 |
142 | ## Configuration (configuration.py)
143 |
144 | llm_model: Primary LLM model to use for both structured output and tools
145 |
146 | # Optional overrides to change the models used for different parts of the workflow
147 | structured_llm_model: Override model for structured output
148 | tools_llm_model: Override model for tools
149 | chunk_llm_model: Small model for chunk biographical event detection
150 |
151 | # Maximum tokens for the models
152 | structured_llm_max_tokens: Maximum tokens for structured output model
153 | tools_llm_max_tokens: Maximum tokens for tools model
154 |
155 | # Maximum retry attempts for the models
156 | max_structured_output_retries: Maximum retry attempts for structured output
157 | max_tools_output_retries: Maximum retry attempts for tool calls
158 |
159 | # Values from graph files
160 | default_chunk_size: Default chunk size for text processing
161 | default_overlap_size: Default overlap size between chunks
162 | max_content_length: Maximum content length to process
163 | max_tool_iterations: Maximum number of tool iterations
164 | max_chunks: Maximum number of chunks to process for biographical event detection
165 |
166 | ## Architecture / Internals
167 |
168 | 1. **Supervisor Agent** - Coordinates the entire workflow, decides next steps
169 | 2. **Research Agent** - Finds relevant biographical sources, manages crawler and merge agents
170 | 3. **URL Crawler** - Extracts content from web pages with Firecrawl
171 | 4. **Merge Agent** - Combines and deduplicates events
172 |
173 |
174 |
175 | ## Roadmap / Future Work
176 |
177 | - Add images to relevant events
178 | - Improve speed of merge graph
179 |
180 | ## Contributing
181 |
182 | We welcome contributions! This is a great project to learn:
183 |
184 | 1. **Fork** the repository
185 | 2. **Create** a feature branch: `git checkout -b feature/amazing-feature`
186 | 3. **Commit** your changes: `git commit -m 'Add amazing feature'`
187 | 4. **Push** to the branch: `git push origin feature/amazing-feature`
188 | 5. **Open** a Pull Request
189 |
190 | See the [open issues](https://github.com/bernatsampera/event-deep-research/issues) for a full list of proposed features and known issues.
191 |
192 | ## License
193 |
194 | Distributed under the MIT License. See `LICENSE.txt` for details.
195 |
196 | ## Acknowledgments
197 |
198 | - **[LangChain](https://github.com/langchain-ai/langchain)** - Foundational LLM framework
199 | - **[LangGraph](https://github.com/langchain-ai/langgraph)** - Multi-agent orchestration
200 | - **[Open Deep Research](https://github.com/langchain-ai/open_deep_research)** - Research methodology inspiration
201 | - **[Firecrawl](https://www.firecrawl.com/)** - Web scraping
202 | - **[Tavily](https://tavily.ai/)** - Web search
203 |
204 | [contributors-shield]: https://img.shields.io/github/contributors/bernatsampera/event-deep-research.svg?style=for-the-badge
205 | [contributors-url]: https://github.com/bernatsampera/event-deep-research/graphs/contributors
206 | [forks-shield]: https://img.shields.io/github/forks/bernatsampera/event-deep-research.svg?style=for-the-badge
207 | [forks-url]: https://github.com/bernatsampera/event-deep-research/network/members
208 | [stars-shield]: https://img.shields.io/github/stars/bernatsampera/event-deep-research.svg?style=for-the-badge
209 | [stars-url]: https://github.com/bernatsampera/event-deep-research/stargazers
210 | [issues-shield]: https://img.shields.io/github/issues/bernatsampera/event-deep-research.svg?style=for-the-badge
211 | [issues-url]: https://github.com/bernatsampera/event-deep-research/issues
212 | [license-shield]: https://img.shields.io/github/license/bernatsampera/event-deep-research.svg?style=for-the-badge
213 | [license-url]: https://github.com/bernatsampera/event-deep-research/blob/master/LICENSE.txt
214 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555
215 | [linkedin-url]: https://www.linkedin.com/in/bernat-sampera-195152107/
216 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/merge_events_graph.py:
--------------------------------------------------------------------------------
import asyncio
from typing import List, Literal, TypedDict

from langchain_core.tools import tool
from langgraph.graph import START, StateGraph
from langgraph.graph.state import Command, RunnableConfig
from langgraph.pregel.main import asyncio
from pydantic import BaseModel, Field
from src.configuration import Configuration
from src.llm_service import create_llm_with_tools
from src.research_events.chunk_graph import create_biographic_event_graph
from src.research_events.merge_events.prompts import (
    EXTRACT_AND_CATEGORIZE_PROMPT,
    MERGE_EVENTS_TEMPLATE,
)
from src.research_events.merge_events.utils import ensure_categories_with_events
from src.services.event_service import EventService
from src.state import CategoriesWithEvents
from src.url_crawler.utils import chunk_text_by_tokens
from src.utils import get_langfuse_handler
20 |
21 |
# NOTE: this model is wrapped with `tool(...)` and offered to the LLM, so the
# docstring and Field descriptions below form the tool schema the model sees.
# Keep them stable; changing them changes prompting behavior.
class RelevantEventsCategorized(BaseModel):
    """The chunk contains relevant biographical events that have been categorized."""

    # Each field holds newline-separated bullet points for one life category.
    early: str = Field(
        description="Bullet points of events related to childhood, upbringing, family, education, and early influences"
    )
    personal: str = Field(
        description="Bullet points of events related to relationships, friendships, family life, residence, and personal traits"
    )
    career: str = Field(
        description="Bullet points of events related to professional journey, publications, collaborations, and milestones"
    )
    legacy: str = Field(
        description="Bullet points of events related to recognition, impact, influence, and how they are remembered"
    )
38 |
# Deliberately empty tool model: the LLM "calling" it is the signal that a
# chunk carries no usable events; no payload is needed.
class IrrelevantChunk(BaseModel):
    """The chunk contains NO biographical events relevant to the research question."""
41 |
42 |
class InputMergeEventsState(TypedDict):
    """The complete state for the enhanced event merging sub-graph."""

    # Events accumulated so far, grouped by life category.
    existing_events: CategoriesWithEvents
    # Raw text of newly crawled events, not yet chunked or categorized.
    extracted_events: str
    # The question guiding which events are considered relevant.
    research_question: str
49 |
50 |
class MergeEventsState(InputMergeEventsState):
    """Internal working state, populated as the sub-graph progresses."""

    text_chunks: List[str]  # token-based chunks
    categorized_chunks: List[CategoriesWithEvents]  # results per chunk
    # Per-chunk categorizations merged into a single object (see merge_categorizations).
    extracted_events_categorized: CategoriesWithEvents
55 |
56 |
class OutputMergeEventsState(TypedDict):
    """Output schema: only the merged event set leaves the sub-graph."""

    existing_events: CategoriesWithEvents  # includes the existing events + the events from the new events
59 |
60 |
async def split_events(
    state: MergeEventsState,
) -> Command[Literal["filter_chunks", "__end__"]]:
    """Split the raw extracted text into token-based chunks.

    Empty or whitespace-only input short-circuits to the end of the
    sub-graph; otherwise at most the first 20 chunks are forwarded to the
    biographical-event filter.
    """
    raw_text = state.get("extracted_events", "")

    if raw_text.strip():
        all_chunks = await chunk_text_by_tokens(raw_text)
        # Cap the workload: only the first 20 chunks move forward.
        return Command(
            goto="filter_chunks",
            update={"text_chunks": all_chunks[:20], "categorized_chunks": []},
        )

    # Nothing to process.
    return Command(
        goto="__end__",
        update={"text_chunks": [], "categorized_chunks": []},
    )
80 |
81 |
async def filter_chunks(
    state: MergeEventsState, config: RunnableConfig
) -> Command[Literal["extract_and_categorize_chunk", "__end__"]]:
    """Filter chunks to only process those containing biographical events.

    Each chunk is run through the biographic-event detection graph; only
    chunks flagged as containing at least one biographical event are kept.

    Args:
        state: Current merge state holding the token-based text chunks.
        config: Runtime configuration (provides `max_chunks`).

    Returns:
        Command routing to the categorizer with the relevant chunks, or to
        `__end__` when there is nothing worth processing.
    """
    chunks = state.get("text_chunks", [])

    if not chunks:
        return Command(
            goto="__end__",
        )

    # Use chunk graph to filter for biographical events
    chunk_graph = create_biographic_event_graph()

    configurable = Configuration.from_runnable_config(config)
    if len(chunks) > configurable.max_chunks:
        # To avoid recursion issues, cap the number of chunks processed
        chunks = chunks[: configurable.max_chunks]

    # Process each chunk through the biographic event detection graph
    relevant_chunks = []
    for chunk in chunks:
        chunk_result = await chunk_graph.ainvoke({"text": chunk}, config)

        # Check if any sub-result flags the chunk as biographical
        has_events = any(
            result.contains_biographic_event
            for result in chunk_result["results"].values()
        )
        print(f"contains_biographic_event: {has_events}")

        if has_events:
            relevant_chunks.append(chunk)

    if not relevant_chunks:
        # No relevant chunks found
        return Command(goto="__end__")

    # Bug fix: forward only the relevant chunks. Previously the unfiltered
    # `chunks` list was passed on, discarding the filtering work above.
    return Command(
        goto="extract_and_categorize_chunk",
        update={"text_chunks": relevant_chunks, "categorized_chunks": []},
    )
124 |
125 |
async def extract_and_categorize_chunk(
    state: MergeEventsState, config: RunnableConfig
) -> Command[Literal["extract_and_categorize_chunk", "merge_categorizations"]]:
    """Extract and categorize biographical events from one chunk per pass.

    The node loops on itself, processing chunk `len(categorized_chunks)`
    each iteration, until every chunk has a categorization result.

    Args:
        state: Current merge state with `text_chunks` and progress so far.
        config: Runtime configuration with model settings.

    Returns:
        Command looping back to this node with one more result appended,
        or routing to `merge_categorizations` once all chunks are done.
    """
    chunks = state.get("text_chunks", [])
    categorized_chunks = state.get("categorized_chunks", [])

    if len(categorized_chunks) >= len(chunks):
        # all categorized_chunks done → move to merge
        return Command(goto="merge_categorizations")

    # The number of results so far doubles as the index of the next chunk.
    chunk = chunks[len(categorized_chunks)]

    # (Removed an unused `research_question` local and a dead commented-out
    # format argument — the prompt only takes the chunk text.)
    prompt = EXTRACT_AND_CATEGORIZE_PROMPT.format(text_chunk=chunk)

    # The model answers by "calling" one of these two schema tools.
    tools = [tool(RelevantEventsCategorized), tool(IrrelevantChunk)]
    model = create_llm_with_tools(tools=tools, config=config)
    response = await model.ainvoke(prompt)

    # Parse response
    if (
        response.tool_calls
        and response.tool_calls[0]["name"] == "RelevantEventsCategorized"
    ):
        categorized_data = response.tool_calls[0]["args"]
        # Some models return lists of bullets; normalize each value to a string.
        categorized_data = {
            k: "\n".join(v) if isinstance(v, list) else v
            for k, v in categorized_data.items()
        }
        categorized = CategoriesWithEvents(**categorized_data)
    else:
        # Irrelevant chunk (or no tool call at all): record an empty result.
        categorized = CategoriesWithEvents(early="", personal="", career="", legacy="")

    return Command(
        goto="extract_and_categorize_chunk",  # loop until all chunks processed
        update={"categorized_chunks": categorized_chunks + [categorized]},
    )
169 |
170 |
async def merge_categorizations(
    state: MergeEventsState,
) -> Command[Literal["combine_new_and_original_events"]]:
    """Collapse the per-chunk categorizations into one CategoriesWithEvents."""
    per_chunk_results = state.get("categorized_chunks", [])

    combined = EventService.merge_categorized_events(per_chunk_results)

    return Command(
        goto="combine_new_and_original_events",
        update={"extracted_events_categorized": combined},
    )
183 |
184 |
async def combine_new_and_original_events(
    state: MergeEventsState, config: RunnableConfig
) -> Command:
    """Merge original and new events for each category using an LLM.

    For every non-empty category, a merge prompt combines the existing
    bullet list with the newly extracted one; all category merges run
    concurrently via asyncio.gather.

    Args:
        state: Merge state with `existing_events` and the categorized new events.
        config: Runtime configuration with model settings.

    Returns:
        Command ending the sub-graph with the merged `existing_events`.
    """
    # Hoisted out of the per-category loop below: the import is loop-invariant.
    from src.llm_service import create_llm_structured_model

    print("Combining new and original events...")

    existing_events_raw = state.get(
        "existing_events",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )
    new_events_raw = state.get(
        "extracted_events_categorized",
        CategoriesWithEvents(early="", personal="", career="", legacy=""),
    )

    # Convert to proper Pydantic models if they're dicts
    existing_events = ensure_categories_with_events(existing_events_raw)
    new_events = ensure_categories_with_events(new_events_raw)

    if not new_events or not any(
        getattr(new_events, cat, "").strip()
        for cat in CategoriesWithEvents.model_fields.keys()
    ):
        print("No new events found. Keeping existing events.")
        return Command(goto="__end__", update={"existing_events": existing_events})

    merge_tasks = []
    for category in CategoriesWithEvents.model_fields.keys():
        # Safe to use getattr: both sides are guaranteed Pydantic models.
        existing_text = getattr(existing_events, category, "").strip()
        new_text = getattr(new_events, category, "").strip()

        if not (existing_text or new_text):
            continue  # nothing to merge in this category

        prompt = MERGE_EVENTS_TEMPLATE.format(
            original=existing_text if existing_text else "No events",
            new=new_text if new_text else "No events",
        )

        # Use regular structured model for merging (not tools model)
        merge_tasks.append(
            (category, create_llm_structured_model(config=config).ainvoke(prompt))
        )

    final_merged_dict = {}
    if merge_tasks:
        # Renamed from `categories` to avoid shadowing the loop source above.
        task_categories, tasks = zip(*merge_tasks)
        responses = await asyncio.gather(*tasks)
        final_merged_dict = {
            cat: resp.content for cat, resp in zip(task_categories, responses)
        }

    # Categories with nothing to merge keep their existing text.
    for category in CategoriesWithEvents.model_fields.keys():
        if category not in final_merged_dict:
            final_merged_dict[category] = getattr(existing_events, category, "")

    final_merged_output = CategoriesWithEvents(**final_merged_dict)
    return Command(goto="__end__", update={"existing_events": final_merged_output})
251 |
252 |
# Assemble the merge sub-graph. Routing between nodes is done via
# Command(goto=...) inside the nodes, so only the entry edge is declared.
merge_events_graph_builder = StateGraph(
    MergeEventsState, input_schema=InputMergeEventsState, config_schema=Configuration
)

merge_events_graph_builder.add_node("split_events", split_events)
merge_events_graph_builder.add_node("filter_chunks", filter_chunks)
merge_events_graph_builder.add_node(
    "extract_and_categorize_chunk", extract_and_categorize_chunk
)
merge_events_graph_builder.add_node("merge_categorizations", merge_categorizations)
merge_events_graph_builder.add_node(
    "combine_new_and_original_events", combine_new_and_original_events
)

merge_events_graph_builder.add_edge(START, "split_events")


# Bug fix: the Python RunnableConfig key is "recursion_limit" (snake_case);
# "recursionLimit" is the LangGraph JS spelling and was silently ignored,
# leaving the default limit in place despite the self-looping chunk node.
merge_events_app = merge_events_graph_builder.compile().with_config(
    {
        "callbacks": [get_langfuse_handler()],
        "recursion_limit": 200,
    },
)
276 |
--------------------------------------------------------------------------------
/src/research_events/merge_events/test.json:
--------------------------------------------------------------------------------
1 | {
2 | "existing_events": [
3 | {
4 | "name": "Henry Miller was born",
5 | "description": "Henry Valentine Miller was born on December 26 in New York City.",
6 | "date": {
7 | "year": 1891,
8 | "note": "December 26"
9 | },
10 | "location": "New York City",
11 | "id": "henry_miller_born"
12 | },
13 | {
14 | "name": "Family moved to Brooklyn",
15 | "description": "Miller's family moved to 1063 Decatur Street in Brooklyn's Bushwick neighborhood.",
16 | "date": {
17 | "year": 1900,
18 | "note": ""
19 | },
20 | "location": "Brooklyn, New York City",
21 | "id": "henry_miller_family_moved_to_brooklyn"
22 | },
23 | {
24 | "name": "Henry Miller was active with the Socialist Party of America",
25 | "description": "Miller was active with the Socialist Party of America.",
26 | "date": {
27 | "year": 1900,
28 | "note": "circa"
29 | },
30 | "location": "New York",
31 | "id": "henry_miller_active_with_the_socialist_party_of_america"
32 | },
33 | {
34 | "name": "Attended Eastern District High School",
35 | "description": "Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school.",
36 | "date": {
37 | "year": 1900,
38 | "note": "early 1900s"
39 | },
40 | "location": "Williamsburg, Brooklyn",
41 | "id": "henry_miller_attended_high_school"
42 | },
43 | {
44 | "name": "Attended City College of New York",
45 | "description": "Miller attended the City College of New York for one semester.",
46 | "date": {
47 | "year": 1910,
48 | "note": "early 1910s"
49 | },
50 | "location": "New York City",
51 | "id": "henry_miller_attended_city_college"
52 | },
53 | {
54 | "name": "Henry Miller married Beatrice Sylvas Wickens",
55 | "description": "Miller married Beatrice Sylvas Wickens, an amateur pianist.",
56 | "date": {
57 | "year": 1917,
58 | "note": ""
59 | },
60 | "location": "New York",
61 | "id": "henry_miller_married_beatrice_sylvas_wickens"
62 | },
63 | {
64 | "name": "Daughter Barbara born",
65 | "description": "Miller and Beatrice had a daughter named Barbara.",
66 | "date": {
67 | "year": 1919,
68 | "note": ""
69 | },
70 | "location": "New York",
71 | "id": "henry_miller_daughter_barbara_born"
72 | },
73 | {
74 | "name": "Worked at Western Union",
75 | "description": "Miller worked at Western Union as personnel manager in the messenger department.",
76 | "date": {
77 | "year": 1920,
78 | "note": "1920-1924"
79 | },
80 | "location": "New York City",
81 | "id": "henry_miller_worked_at_western_union"
82 | },
83 | {
84 | "name": "Wrote first novel Clipped Wings",
85 | "description": "Miller wrote his first novel, 'Clipped Wings,' during a three-week vacation in March (unpublished, only fragments remain).",
86 | "date": {
87 | "year": 1922,
88 | "note": "March"
89 | },
90 | "location": "New York City",
91 | "id": "henry_miller_wrote_clipped_wings"
92 | },
93 | {
94 | "name": "Divorced Beatrice Sylvas Wickens",
95 | "description": "Miller was divorced from Beatrice Sylvas Wickens.",
96 | "date": {
97 | "year": 1923,
98 | "note": "December 21"
99 | },
100 | "location": "New York",
101 | "id": "henry_miller_divorced_beatrice"
102 | },
103 | {
104 | "name": "Met June Mansfield",
105 | "description": "Miller met and became enamored of June Mansfield (born Juliet Edith Smerth) at a dance hall.",
106 | "date": {
107 | "year": 1923,
108 | "note": ""
109 | },
110 | "location": "New York",
111 | "id": "henry_miller_met_june_mansfield"
112 | },
113 | {
114 | "name": "Married June Mansfield",
115 | "description": "Miller married June Mansfield.",
116 | "date": {
117 | "year": 1924,
118 | "note": "June 1"
119 | },
120 | "location": "New York",
121 | "id": "henry_miller_married_june_mansfield"
122 | },
123 | {
124 | "name": "Quit Western Union to write",
125 | "description": "Miller quit his job at Western Union to dedicate himself completely to writing.",
126 | "date": {
127 | "year": 1924,
128 | "note": ""
129 | },
130 | "location": "New York",
131 | "id": "henry_miller_quit_western_union"
132 | },
133 | {
134 | "name": "Wrote Moloch",
135 | "description": "Miller wrote 'Moloch: or, This Gentile World,' initially under the guise of a novel by June Mansfield (unpublished until 1992).",
136 | "date": {
137 | "year": 1927,
138 | "note": "1927-1928"
139 | },
140 | "location": "New York",
141 | "id": "henry_miller_wrote_moloch"
142 | },
143 | {
144 | "name": "Spent months in Paris with June",
145 | "description": "Miller spent several months in Paris with June, trip financed by Roland Freedman.",
146 | "date": {
147 | "year": 1928,
148 | "note": ""
149 | },
150 | "location": "Paris, France",
151 | "id": "henry_miller_paris_with_june_1928"
152 | },
153 | {
154 | "name": "Moved to Paris alone",
155 | "description": "Miller moved to Paris unaccompanied.",
156 | "date": {
157 | "year": 1930,
158 | "note": ""
159 | },
160 | "location": "Paris, France",
161 | "id": "henry_miller_moved_to_paris_alone"
162 | },
163 | {
164 | "name": "Proofreader for Chicago Tribune Paris",
165 | "description": "Miller was employed by the Chicago Tribune Paris edition as a proofreader.",
166 | "date": {
167 | "year": 1931,
168 | "note": ""
169 | },
170 | "location": "Paris, France",
171 | "id": "henry_miller_proofreader_chicago_tribune"
172 | },
173 | {
174 | "name": "Published Tropic of Cancer",
175 | "description": "Miller's first published book, 'Tropic of Cancer,' was published by Obelisk Press (banned in the United States).",
176 | "date": {
177 | "year": 1934,
178 | "note": ""
179 | },
180 | "location": "Paris, France",
181 | "id": "henry_miller_published_tropic_of_cancer"
182 | },
183 | {
184 | "name": "Divorced by June in Mexico",
185 | "description": "June divorced Miller by proxy.",
186 | "date": {
187 | "year": 1934,
188 | "note": ""
189 | },
190 | "location": "Mexico City, Mexico",
191 | "id": "henry_miller_divorced_by_june"
192 | },
193 | {
194 | "name": "Published Black Spring",
195 | "description": "Miller published 'Black Spring' (banned in the United States).",
196 | "date": {
197 | "year": 1936,
198 | "note": ""
199 | },
200 | "location": "Paris, France",
201 | "id": "henry_miller_published_black_spring"
202 | },
203 | {
204 | "name": "Published Tropic of Capricorn",
205 | "description": "Miller published 'Tropic of Capricorn' (banned in the United States).",
206 | "date": {
207 | "year": 1939,
208 | "note": ""
209 | },
210 | "location": "Paris, France",
211 | "id": "henry_miller_published_tropic_of_capricorn"
212 | },
213 | {
214 | "name": "Visited Greece",
215 | "description": "Miller visited Greece, invited by Lawrence Durrell who was living in Corfu.",
216 | "date": {
217 | "year": 1939,
218 | "note": ""
219 | },
220 | "location": "Greece",
221 | "id": "henry_miller_visited_greece"
222 | },
223 | {
224 | "name": "Returned to New York",
225 | "description": "Miller returned to New York.",
226 | "date": {
227 | "year": 1940,
228 | "note": ""
229 | },
230 | "location": "New York City",
231 | "id": "henry_miller_returned_to_new_york"
232 | },
233 | {
234 | "name": "Published The Colossus of Maroussi",
235 | "description": "Miller described his visit to Greece in 'The Colossus of Maroussi'.",
236 | "date": {
237 | "year": 1941,
238 | "note": ""
239 | },
240 | "location": "United States",
241 | "id": "henry_miller_published_colossus_of_maroussi"
242 | },
243 | {
244 | "name": "Moved to California",
245 | "description": "Miller moved to California in June, initially residing just outside Hollywood in Beverly Glen.",
246 | "date": {
247 | "year": 1942,
248 | "note": "June"
249 | },
250 | "location": "California",
251 | "id": "henry_miller_moved_to_california"
252 | },
253 | {
254 | "name": "Began writing Sexus",
255 | "description": "Miller began writing 'Sexus,' the first novel in 'The Rosy Crucifixion' trilogy.",
256 | "date": {
257 | "year": 1942,
258 | "note": ""
259 | },
260 | "location": "California",
261 | "id": "henry_miller_began_sexus"
262 | },
263 | {
264 | "name": "Settled in Big Sur",
265 | "description": "Miller settled in Big Sur.",
266 | "date": {
267 | "year": 1944,
268 | "note": ""
269 | },
270 | "location": "Big Sur, California",
271 | "id": "henry_miller_settled_big_sur"
272 | },
273 | {
274 | "name": "Married Janina Martha Lepska",
275 | "description": "Miller married Janina Martha Lepska.",
276 | "date": {
277 | "year": 1944,
278 | "note": ""
279 | },
280 | "location": "United States",
281 | "id": "henry_miller_married_janina_lepska"
282 | },
283 | {
284 | "name": "Published Sunday After the War",
285 | "description": "Miller published 'Sunday After the War'.",
286 | "date": {
287 | "year": 1944,
288 | "note": ""
289 | },
290 | "location": "United States",
291 | "id": "henry_miller_published_sunday_after_the_war"
292 | },
293 | {
294 | "name": "Published The Air-Conditioned Nightmare",
295 | "description": "Miller published 'The Air-Conditioned Nightmare'.",
296 | "date": {
297 | "year": 1945,
298 | "note": ""
299 | },
300 | "location": "United States",
301 | "id": "henry_miller_published_air_conditioned_nightmare"
302 | },
303 | {
304 | "name": "Lived in Big Sur with bohemian writers",
305 | "description": "Miller continued living in Big Sur with other bohemian writers.",
306 | "date": {
307 | "year": 1947,
308 | "note": "from 1947"
309 | },
310 | "location": "Big Sur, California",
311 | "id": "henry_miller_lived_big_sur_bohemians"
312 | },
313 | {
314 | "name": "Divorced Janina Martha Lepska",
315 | "description": "Miller was divorced from Janina Martha Lepska.",
316 | "date": {
317 | "year": 1952,
318 | "note": ""
319 | },
320 | "location": "United States",
321 | "id": "henry_miller_divorced_janina"
322 | },
323 | {
324 | "name": "Married artist Eve McClure",
325 | "description": "Miller married artist Eve McClure.",
326 | "date": {
327 | "year": 1953,
328 | "note": ""
329 | },
330 | "location": "United States",
331 | "id": "henry_miller_married_eve_mcclure"
332 | },
333 | {
334 | "name": "Published Big Sur and the Oranges of Hieronymus Bosch",
335 | "description": "Miller published 'Big Sur and the Oranges of Hieronymus Bosch'.",
336 | "date": {
337 | "year": 1957,
338 | "note": ""
339 | },
340 | "location": "United States",
341 | "id": "henry_miller_published_big_sur_oranges"
342 | },
343 | {
344 | "name": "Completed The Rosy Crucifixion trilogy",
345 | "description": "Miller completed 'The Rosy Crucifixion' trilogy (initially banned in the U.S., published in France and Japan).",
346 | "date": {
347 | "year": 1959,
348 | "note": ""
349 | },
350 | "location": "United States",
351 | "id": "henry_miller_completed_rosy_crucifixion"
352 | },
353 | {
354 | "name": "Divorced Eve McClure",
355 | "description": "Miller was divorced from Eve McClure.",
356 | "date": {
357 | "year": 1960,
358 | "note": ""
359 | },
360 | "location": "United States",
361 | "id": "henry_miller_divorced_eve"
362 | },
363 | {
364 | "name": "Reunion with June in New York",
365 | "description": "Miller arranged a reunion with ex-wife June in New York.",
366 | "date": {
367 | "year": 1961,
368 | "note": ""
369 | },
370 | "location": "New York",
371 | "id": "henry_miller_reunion_with_june"
372 | },
373 | {
374 | "name": "Tropic of Cancer published in the US",
375 | "description": "'Tropic of Cancer' was published in the United States by Grove Press, leading to obscenity trials.",
376 | "date": {
377 | "year": 1961,
378 | "note": ""
379 | },
380 | "location": "United States",
381 | "id": "henry_miller_tropic_cancer_us_publication"
382 | },
383 | {
384 | "name": "Moved to Pacific Palisades",
385 | "description": "Miller moved to 444 Ocampo Drive, Pacific Palisades, Los Angeles.",
386 | "date": {
387 | "year": 1963,
388 | "note": ""
389 | },
390 | "location": "Pacific Palisades, Los Angeles, California",
391 | "id": "henry_miller_moved_pacific_palisades"
392 | },
393 | {
394 | "name": "Married Hiroko Tokuda",
395 | "description": "Miller married Hiroko Tokuda.",
396 | "date": {
397 | "year": 1967,
398 | "note": ""
399 | },
400 | "location": "United States",
401 | "id": "henry_miller_married_hiroko_tokuda"
402 | },
403 | {
404 | "name": "Published On Turning Eighty",
405 | "description": "Miller published 'On Turning Eighty,' a chapbook with 200 copies.",
406 | "date": {
407 | "year": 1972,
408 | "note": ""
409 | },
410 | "location": "United States",
411 | "id": "henry_miller_published_on_turning_eighty"
412 | },
413 | {
414 | "name": "Nominated for Nobel Prize",
415 | "description": "Miller was nominated for the Nobel Prize in Literature by University of Copenhagen professor Allan Philip.",
416 | "date": {
417 | "year": 1973,
418 | "note": ""
419 | },
420 | "location": "",
421 | "id": "henry_miller_nobel_nomination"
422 | },
423 | {
424 | "name": "Divorced Hiroko Tokuda",
425 | "description": "Miller was divorced from Hiroko Tokuda.",
426 | "date": {
427 | "year": 1977,
428 | "note": ""
429 | },
430 | "location": "United States",
431 | "id": "henry_miller_divorced_hiroko"
432 | },
433 | {
434 | "name": "Henry Miller died",
435 | "description": "Henry Miller died of circulatory complications at home.",
436 | "date": {
437 | "year": 1980,
438 | "note": "June 7"
439 | },
440 | "location": "Pacific Palisades, Los Angeles",
441 | "id": "henry_miller_died"
442 | }
443 | ],
444 | "url_events_summarized": "Henry Valentine Miller was born at his family's home, 450 East 85th Street, in the Yorkville section of Manhattan, New York City, U.S. He was the son of Lutheran German parents, Louise Marie (Neiting) and tailor Heinrich Miller. Miller attended Eastern District High School in Williamsburg, Brooklyn, after finishing elementary school. While he was a socialist, his idol was the black Socialist Hubert Harrison. Miller married his first wife, Beatrice Sylvas Wickens, in 1917; their divorce was granted on December 21, 1923. Together they had a daughter, Barbara, born in 1919."
445 | }
446 |
--------------------------------------------------------------------------------