├── utils
├── __init__.py
└── call_llm.py
├── requirements.txt
├── assets
└── banner.png
├── flow.py
├── .gitignore
├── main.py
├── README.md
├── docs
└── design.md
├── data_profiling_report.md
├── test
└── patients.csv
├── nodes.py
├── .clinerules
└── .cursorrules
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pocketflow>=0.0.1
2 | pandas>=2.0.0
3 | PyYAML>=6.0
4 | openai>=1.0.0
5 | google-genai
--------------------------------------------------------------------------------
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Pocket/PocketFlow-Tutorial-Data-Profiler/main/assets/banner.png
--------------------------------------------------------------------------------
/utils/call_llm.py:
--------------------------------------------------------------------------------
1 | from google import genai
2 | import os
3 |
def call_llm(prompt: str) -> str:
    """
    Call Google Gemini LLM with the given prompt.

    Args:
        prompt (str): The prompt to send to the LLM

    Returns:
        str: The text response from the LLM

    Raises:
        RuntimeError: If the GEMINI_API_KEY environment variable is not set.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        # Fail fast with an actionable message instead of sending the old
        # placeholder default ("Your API Key") to the API and getting back
        # a cryptic authentication error.
        raise RuntimeError(
            "GEMINI_API_KEY environment variable is not set. "
            "Export it before running the profiler."
        )
    client = genai.Client(api_key=api_key)
    # Model is overridable via env var; defaults to Gemini 2.5 Pro.
    model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")

    response = client.models.generate_content(
        model=model,
        contents=[prompt]
    )
    return response.text
23 |
if __name__ == "__main__":
    # Smoke test: requires GEMINI_API_KEY to be set in the environment.
    print("Making call...")
    answer = call_llm("Hello, how are you?")
    print(f"Response: {answer}")
--------------------------------------------------------------------------------
/flow.py:
--------------------------------------------------------------------------------
1 | from pocketflow import Flow
2 | from nodes import (
3 | DuplicateDetectionNode,
4 | TableSummaryNode,
5 | ColumnDescriptionNode,
6 | DataTypeAnalysisNode,
7 | MissingValuesAnalysisNode,
8 | UniquenessAnalysisNode,
9 | UnusualValuesDetectionNode,
10 | GenerateReportNode
11 | )
12 |
def create_data_profiling_flow():
    """Create and return a data profiling flow."""

    # Nodes listed in execution order (mirrors the workflow in docs/design.md).
    pipeline = [
        DuplicateDetectionNode(),
        TableSummaryNode(),
        ColumnDescriptionNode(),
        DataTypeAnalysisNode(),
        MissingValuesAnalysisNode(),
        UniquenessAnalysisNode(),
        UnusualValuesDetectionNode(),
        GenerateReportNode(),
    ]

    # Wire each node to its successor using PocketFlow's >> operator.
    for current, successor in zip(pipeline, pipeline[1:]):
        current >> successor

    # The flow begins with duplicate detection.
    return Flow(start=pipeline[0])
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependencies
2 | node_modules/
3 | vendor/
4 | .pnp/
5 | .pnp.js
6 |
7 | # Build outputs
8 | dist/
9 | build/
10 | out/
11 | *.pyc
12 | __pycache__/
13 |
14 | # Environment files
15 | .env
16 | .env.local
17 | .env.*.local
18 | .env.development
19 | .env.test
20 | .env.production
21 |
22 | # IDE - VSCode
23 | .vscode/*
24 | !.vscode/settings.json
25 | !.vscode/tasks.json
26 | !.vscode/launch.json
27 | !.vscode/extensions.json
28 |
29 | # IDE - JetBrains
30 | .idea/
31 | *.iml
32 | *.iws
33 | *.ipr
34 |
35 | # IDE - Eclipse
36 | .project
37 | .classpath
38 | .settings/
39 |
40 | # Logs
41 | logs/
42 | *.log
43 | npm-debug.log*
44 | yarn-debug.log*
45 | yarn-error.log*
46 |
47 | # Operating System
48 | .DS_Store
49 | Thumbs.db
50 | *.swp
51 | *.swo
52 |
53 | # Testing
54 | coverage/
55 | .nyc_output/
56 |
57 | # Temporary files
58 | *.tmp
59 | *.temp
60 | .cache/
61 |
62 | # Compiled files
63 | *.com
64 | *.class
65 | *.dll
66 | *.exe
67 | *.o
68 | *.so
69 |
70 | # Package files
71 | *.7z
72 | *.dmg
73 | *.gz
74 | *.iso
75 | *.jar
76 | *.rar
77 | *.tar
78 | *.zip
79 |
80 | # Database
81 | *.sqlite
82 | *.sqlite3
83 | *.db
84 |
85 | # Optional npm cache directory
86 | .npm
87 |
88 | # Optional eslint cache
89 | .eslintcache
90 |
91 | # Optional REPL history
92 | .node_repl_history
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from flow import create_data_profiling_flow
3 |
def main(csv_path="test/patients.csv"):
    """Run the data profiling workflow over a CSV file.

    Loads the CSV into a pandas DataFrame, runs the PocketFlow profiling
    pipeline, writes the markdown report to data_profiling_report.md, and
    prints a short console summary.

    Args:
        csv_path (str): Path of the CSV file to profile. Defaults to the
            bundled sample patient dataset.
    """
    # Load the dataset to profile
    print("Loading patient data...")
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

    # Initialize shared store with the data profiling structure
    # (mirrors the schema documented in docs/design.md).
    shared = {
        "dataframe": df,
        "sample_data": "",
        "profile_results": {
            "duplicates": {},
            "table_summary": "",
            "column_descriptions": {},
            "data_types": {},
            "missing_values": {},
            "uniqueness": {},
            "unusual_values": {}
        },
        "final_report": ""
    }

    # Create and run the data profiling flow
    print("\nStarting data profiling analysis...")
    profiling_flow = create_data_profiling_flow()
    profiling_flow.run(shared)

    # Save the report first (avoid console encoding issues)
    with open("data_profiling_report.md", "w", encoding="utf-8") as f:
        f.write(shared["final_report"])
    print("\nReport saved to: data_profiling_report.md")
    print(f"Report contains {len(shared['final_report'])} characters")

    # Show basic stats instead of the full report. Use .get() so a partially
    # failed flow still prints a summary (report is already on disk) instead
    # of dying on a KeyError here.
    print("\n" + "="*50 + " SUMMARY " + "="*50)
    dup = shared["profile_results"]["duplicates"]
    print(f"✓ Analyzed {dup.get('total_rows', len(df))} rows, {len(shared['dataframe'].columns)} columns")
    print(f"✓ Found {dup.get('count', 0)} duplicate rows ({dup.get('percentage', 0.0):.1f}%)")
    print(f"✓ Analysis complete - check data_profiling_report.md for full details")
    print("="*108)
46 |
# Script entry point: profiles the default sample dataset.
if __name__ == "__main__":
    main()
49 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PocketFlow Data Profiling Tool
2 |
3 | An intelligent data profiling tool powered by LLMs that provides deep, contextual analysis of your datasets beyond traditional statistical metrics.
4 |
5 | ## 🎯 What This Tool Does
6 |
7 | This tool performs comprehensive data profiling through a 7-step workflow:
8 |
9 | 1. **Duplicate Detection** - Identifies and analyzes duplicate rows with recommendations
10 | 2. **Table Summary** - Generates high-level description of what your data represents
11 | 3. **Column Descriptions** - Analyzes each column with meaningful descriptions and naming suggestions
12 | 4. **Data Type Analysis** - Recommends optimal data types for each column
13 | 5. **Missing Values Analysis** - Categorizes missing values as meaningful vs problematic
14 | 6. **Uniqueness Analysis** - Identifies potential unique identifier columns
15 | 7. **Unusual Values Detection** - Detects outliers, anomalies, and data quality issues
16 |
17 | ## 🚀 How to Run
18 |
19 | ### Prerequisites
20 |
21 | 1. **Install dependencies:**
22 | ```bash
23 | pip install -r requirements.txt
24 | ```
25 |
26 | 2. **Set up your LLM:**
27 |
28 | The tool uses Google Gemini by default. Set your API key:
29 | ```bash
30 | export GEMINI_API_KEY="your-key-here"
31 | ```
32 |
33 | To use your own LLM or different providers, check out the [PocketFlow LLM documentation](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) and modify `utils/call_llm.py` accordingly.
34 |
35 | **Test your LLM setup:**
36 | ```bash
37 | python utils/call_llm.py
38 | ```
39 |
40 | ### Running the Tool
41 |
42 | ```bash
43 | python main.py
44 | ```
45 |
46 | By default, it analyzes the sample patient dataset in `test/patients.csv`. To analyze your own data, modify `main.py`:
47 |
48 | ```python
49 | # Replace this line:
50 | df = pd.read_csv("test/patients.csv")
51 |
52 | # With your data:
53 | df = pd.read_csv("path/to/your/data.csv")
54 | ```
55 |
56 | ### Output
57 |
58 | The tool generates:
59 | - **Console summary** with key statistics
60 | - **Markdown report** saved as `data_profiling_report.md` with comprehensive analysis
61 |
62 | ## 📊 Example Results
63 |
64 | From the sample patient dataset (60 rows, 27 columns):
65 |
66 | - ✅ Detected invalid SSN formats (test data with "999" prefix)
67 | - ✅ Identified name contamination (numeric suffixes in names)
68 | - ✅ Found meaningful missing patterns (83% missing death dates = living patients)
69 | - ✅ Recommended data type conversions (dates to datetime64, categories for demographics)
70 | - ✅ Identified unique identifiers (UUID primary key, SSN)
71 |
72 | ## 🏗️ Architecture
73 |
74 | Built with [PocketFlow](https://github.com/The-Pocket/PocketFlow) - a minimalist LLM framework:
75 |
76 | - **Workflow pattern** for sequential processing pipeline
77 | - **BatchNode** for efficient parallel column analysis
78 | - **YAML-based** structured outputs with validation
79 | - **Intelligent LLM analysis** for contextual understanding
80 |
81 | ## 📁 Project Structure
82 |
83 | ```
84 | ├── main.py # Entry point
85 | ├── flow.py # Flow orchestrator
86 | ├── nodes.py # All profiling nodes
87 | ├── utils/
88 | │ └── call_llm.py # LLM utility (customize for your provider)
89 | ├── test/
90 | │ └── patients.csv # Sample dataset
91 | └── docs/
92 | └── design.md # Design documentation
93 | ```
94 |
95 | ## 🔧 Customization
96 |
97 | ### Using Different LLM Providers
98 |
99 | Edit `utils/call_llm.py` to use your preferred LLM:
100 | - Claude (Anthropic)
101 | - Google Gemini
102 | - Azure OpenAI
103 | - Local models (Ollama)
104 |
105 | See the [PocketFlow LLM guide](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) for examples.
106 |
107 | ### Analyzing Different Data Types
108 |
109 | The tool works with any pandas DataFrame. You can:
110 | - Load from CSV, Excel, JSON, Parquet
111 | - Connect to databases
112 | - Use API data
113 |
114 | Just ensure your data is loaded as a pandas DataFrame before running the flow.
115 |
116 | ## 🎓 Tutorial
117 |
118 | This project demonstrates **Agentic Coding** with [PocketFlow](https://github.com/The-Pocket/PocketFlow). Want to learn more?
119 |
120 | - Check out the [Agentic Coding Guidance](https://the-pocket.github.io/PocketFlow/guide.html)
121 | - Watch the [YouTube Tutorial](https://www.youtube.com/@ZacharyLLM?sub_confirmation=1)
122 |
123 | ## 📝 License
124 |
125 | This project is a tutorial example for PocketFlow.
126 |
--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
1 | # Design Doc: Data Profiling Tool
2 |
3 | > Please DON'T remove notes for AI
4 |
5 | ## Requirements
6 |
7 | > Notes for AI: Keep it simple and clear.
8 | > If the requirements are abstract, write concrete user stories
9 |
10 | **Problem**: Users need to understand their pandas DataFrame data quality and characteristics before analysis or modeling.
11 |
12 | **User Stories**:
13 | - As a data scientist, I want to automatically detect duplicate rows so I can decide whether to remove them
14 | - As an analyst, I want a high-level summary of my table to understand what the data represents
15 | - As a data engineer, I want detailed column descriptions to understand each field's meaning
16 | - As a developer, I want to identify correct data types for proper processing
17 | - As a researcher, I want to find missing values and understand if they're meaningful or problematic
18 | - As a quality analyst, I want to identify unique columns that could serve as identifiers
19 | - As a data validator, I want to detect unusual/outlier values that may indicate data quality issues
20 |
21 | ## Flow Design
22 |
23 | > Notes for AI:
24 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
25 | > 2. Present a concise, high-level description of the workflow.
26 |
27 | ### Applicable Design Pattern:
28 |
29 | 1. **Workflow**: Sequential processing pipeline where each step builds upon previous analysis
30 | 2. **Batch**: Some nodes (like column analysis) process multiple columns in parallel for efficiency
31 |
32 | ### Flow High-level Design:
33 |
34 | 1. **Duplicate Detection Node**: Analyzes the DataFrame for duplicate rows and provides statistics
35 | 2. **Table Summary Node**: Creates a high-level description of what the table represents
36 | 3. **Column Description Node**: Analyzes each column to provide meaningful descriptions and suggest better names
37 | 4. **Data Type Analysis Node**: Determines appropriate data types for each column
38 | 5. **Missing Values Analysis Node**: Identifies missing values and categorizes them as meaningful vs problematic
39 | 6. **Uniqueness Analysis Node**: Identifies columns that could serve as unique identifiers
40 | 7. **Unusual Values Detection Node**: Detects outliers and anomalous values in each column
41 |
42 | ```mermaid
43 | flowchart TD
44 | start[Start: Load DataFrame] --> duplicate[Duplicate Detection]
45 | duplicate --> summary[Table Summary]
46 | summary --> columns[Column Descriptions]
47 | columns --> datatypes[Data Type Analysis]
48 | datatypes --> missing[Missing Values Analysis]
49 | missing --> unique[Uniqueness Analysis]
50 | unique --> unusual[Unusual Values Detection]
51 | unusual --> report[Generate Final Report]
52 | ```
53 |
54 | ## Utility Functions
55 |
56 | > Notes for AI:
57 | > 1. Understand the utility function definition thoroughly by reviewing the doc.
58 | > 2. Include only the necessary utility functions, based on nodes in the flow.
59 |
60 | 1. **Call LLM** (`utils/call_llm.py`)
61 | - *Input*: prompt (str)
62 | - *Output*: response (str)
63 | - Used by all analysis nodes for intelligent data interpretation
64 |
65 | ## Node Design
66 |
67 | ### Shared Store
68 |
69 | > Notes for AI: Try to minimize data redundancy
70 |
71 | The shared store structure is organized as follows:
72 |
73 | ```python
74 | shared = {
75 | "dataframe": pd.DataFrame, # Original DataFrame
76 | "sample_data": str, # CSV sample for LLM analysis
77 | "profile_results": {
78 | "duplicates": {
79 | "count": int,
80 | "percentage": float,
81 | "sample_rows": str
82 | },
83 | "table_summary": str,
84 | "column_descriptions": {
85 | "col_name": {
86 | "description": str,
87 | "suggested_name": str
88 | }
89 | },
90 | "data_types": {
91 | "col_name": {
92 | "current_type": str,
93 | "suggested_type": str,
94 | "confidence": float
95 | }
96 | },
97 | "missing_values": {
98 | "col_name": {
99 | "count": int,
100 | "percentage": float,
101 | "likely_meaningful": bool,
102 | "reason": str
103 | }
104 | },
105 | "uniqueness": {
106 | "col_name": {
107 | "unique_count": int,
108 | "unique_percentage": float,
109 | "is_candidate_key": bool
110 | }
111 | },
112 | "unusual_values": {
113 | "col_name": {
114 | "has_unusual": bool,
115 | "unusual_samples": list,
116 | "explanation": str
117 | }
118 | }
119 | },
120 | "final_report": str # Comprehensive profiling report
121 | }
122 | ```
123 |
124 | ### Node Steps
125 |
126 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
127 |
128 | 1. **Duplicate Detection Node**
129 | - *Purpose*: Detect and analyze duplicate rows in the DataFrame
130 | - *Type*: Regular Node
131 | - *Steps*:
132 | - *prep*: Read "dataframe" from shared store and create sample
133 | - *exec*: Call LLM to analyze duplicate patterns and significance
134 | - *post*: Write duplicate analysis to "profile_results.duplicates"
135 |
136 | 2. **Table Summary Node**
137 | - *Purpose*: Generate high-level description of the table's purpose and content
138 | - *Type*: Regular Node
139 | - *Steps*:
140 | - *prep*: Read "dataframe" sample and column names from shared store
141 | - *exec*: Call LLM to generate comprehensive table summary
142 | - *post*: Write summary to "profile_results.table_summary"
143 |
144 | 3. **Column Description Node**
145 | - *Purpose*: Analyze each column to provide descriptions and name suggestions
146 | - *Type*: Batch Node (processes columns in chunks)
147 | - *Steps*:
148 | - *prep*: Return list of column chunks for parallel processing
149 | - *exec*: Call LLM to analyze each column chunk for descriptions
150 | - *post*: Combine results and write to "profile_results.column_descriptions"
151 |
152 | 4. **Data Type Analysis Node**
153 | - *Purpose*: Determine appropriate data types for each column
154 | - *Type*: Regular Node
155 | - *Steps*:
156 | - *prep*: Read "dataframe" and column info from shared store
157 | - *exec*: Call LLM to analyze data types with sample data
158 | - *post*: Write type analysis to "profile_results.data_types"
159 |
160 | 5. **Missing Values Analysis Node**
161 | - *Purpose*: Analyze missing values to determine if they're meaningful or problematic
162 | - *Type*: Regular Node
163 | - *Steps*:
164 | - *prep*: Read "dataframe" and calculate missing value statistics
165 | - *exec*: Call LLM to determine if missing values are meaningful
166 | - *post*: Write missing value analysis to "profile_results.missing_values"
167 |
168 | 6. **Uniqueness Analysis Node**
169 | - *Purpose*: Identify columns that could serve as unique identifiers
170 | - *Type*: Regular Node
171 | - *Steps*:
172 | - *prep*: Read "dataframe" and calculate uniqueness statistics
173 | - *exec*: Call LLM to determine candidate key columns
174 | - *post*: Write uniqueness analysis to "profile_results.uniqueness"
175 |
176 | 7. **Unusual Values Detection Node**
177 | - *Purpose*: Detect outliers and anomalous values in columns
178 | - *Type*: Batch Node (processes columns individually)
179 | - *Steps*:
180 | - *prep*: Return list of columns to analyze for unusual values
181 | - *exec*: Call LLM to analyze each column's value patterns
182 | - *post*: Write unusual value findings to "profile_results.unusual_values"
183 |
184 |
--------------------------------------------------------------------------------
/data_profiling_report.md:
--------------------------------------------------------------------------------
1 | # Data Profiling Report
2 |
3 | ## Table Summary
4 | This table represents a collection of detailed personal records for individuals. Each person is identified by an **Id**, and may also have an **SSN**, **DRIVERS** license, or **PASSPORT** number. Their full name is detailed with **PREFIX**, **FIRST**, **LAST**, **SUFFIX**, and a **MAIDEN** name if applicable.
5 |
6 | The records include vital and demographic information such as **BIRTHDATE**, **DEATHDATE**, **MARITAL** status, **RACE**, **ETHNICITY**, and **GENDER**. Geographic information specifies the person's **BIRTHPLACE** and their current residential **ADDRESS**, **CITY**, **STATE**, **COUNTY**, **FIPS** code, **ZIP**, and geographic coordinates (**LAT**, **LON**). Finally, the table contains financial information related to an individual's **HEALTHCARE_EXPENSES**, **HEALTHCARE_COVERAGE**, and **INCOME**.
7 |
8 | ## Duplicate Analysis
9 | - **Total rows**: 60
10 | - **Duplicate rows**: 0 (0.00%)
11 | - **Should remove**: False
12 | - **Analysis**: No duplicate rows found in the dataset.
13 |
14 | ## Column Descriptions
15 | - **Id** → *person_id*: A unique identifier for each record, formatted as a UUID (Universally Unique Identifier).
16 | - **BIRTHDATE** → *birth_date*: The person's date of birth in YYYY-MM-DD format.
17 | - **DEATHDATE** → *death_date*: The person's date of death in YYYY-MM-DD format. This field is empty if the person is alive.
18 | - **SSN** → *social_security_number*: The person's 9-digit Social Security Number, formatted as XXX-XX-XXXX.
19 | - **DRIVERS** → *drivers_license_number*: The person's driver's license number.
20 | - **PASSPORT** → *passport_number*: The person's passport number.
21 | - **PREFIX** → *name_prefix*: A title or honorific that precedes a person's name (e.g., 'Mr.', 'Mrs.', 'Dr.').
22 | - **FIRST** → *first_name*: The person's first or given name.
23 | - **LAST** → *last_name*: The person's last or family name.
24 | - **SUFFIX** → *name_suffix*: A suffix that follows a person's full name (e.g., 'Jr.', 'Sr.', 'III').
25 | - **MAIDEN** → *maiden_name*: The individual's last name at birth, often used for married individuals who have changed their name. Appears to have null values for those it does not apply to.
26 | - **MARITAL** → *marital_status*: The individual's marital status. The sample data uses 'M' likely for 'Married'.
27 | - **RACE** → *race*: The individual's self-identified race.
28 | - **ETHNICITY** → *ethnicity*: The individual's self-identified ethnicity, primarily indicating Hispanic or Non-Hispanic origin.
29 | - **GENDER** → *gender*: The individual's gender, represented by 'M' for Male and 'F' for Female.
30 | - **BIRTHPLACE** → *birth_place*: The location where the individual was born, as a single string containing city, state, and country.
31 | - **ADDRESS** → *street_address*: The street address of the individual's residence, including building number, street name, and unit/apartment number.
32 | - **CITY** → *city*: The city of the individual's residential address.
33 | - **STATE** → *state*: The state of the individual's residential address.
34 | - **COUNTY** → *county*: The county of the individual's residential address.
35 | - **FIPS** → *fips_code*: A FIPS (Federal Information Processing Standard) code, likely identifying a US county.
36 | - **ZIP** → *zip_code*: The 5-digit US postal ZIP code for the location.
37 | - **LAT** → *latitude*: The geographic latitude coordinate for the location.
38 | - **LON** → *longitude*: The geographic longitude coordinate for the location.
39 | - **HEALTHCARE_EXPENSES** → *healthcare_expenses_usd*: A monetary value representing healthcare-related expenses, likely per capita or household, in USD.
40 | - **HEALTHCARE_COVERAGE** → *healthcare_coverage_value_usd*: A monetary value related to healthcare coverage, possibly representing total premiums or insured value in the area.
41 | - **INCOME** → *median_income_usd*: A monetary value representing the average or median income for the area, likely in USD.
42 |
43 | ## Data Type Analysis
44 | - **BIRTHDATE**: object → *datetime64* (The column contains date values in a standard 'YYYY-MM-DD' format.)
45 | - **DEATHDATE**: object → *datetime64* (The column contains date values and empty strings, which can be represented as dates and Not a Time (NaT) values.)
46 | - **PREFIX**: object → *category* (The column has a small number of repeated string values (e.g., 'Mr.', 'Mrs.', 'Ms.'), making it ideal for the memory-efficient category type.)
47 | - **SUFFIX**: object → *category* (This column likely contains a small, fixed set of name suffixes (e.g. 'Jr.', 'Sr.'), making it suitable for the category type.)
48 | - **MARITAL**: object → *category* (The column represents marital status and likely has a small number of distinct values ('M', 'S', etc.), making it ideal for the category type.)
49 | - **RACE**: object → *category* (The column contains a small, well-defined set of values for race, which is a classic categorical variable.)
50 | - **ETHNICITY**: object → *category* (The column contains a small, well-defined set of values for ethnicity, making it a categorical variable.)
51 | - **GENDER**: object → *category* (The column has a very small number of distinct values ('M', 'F'), making it a prime candidate for the category type.)
52 | - **CITY**: object → *category* (The number of unique city names is much smaller than the total number of records, making 'category' a memory-efficient choice.)
53 | - **STATE**: object → *category* (The number of unique states is very small and fixed, making this an ideal categorical variable.)
54 | - **COUNTY**: object → *category* (The number of unique counties is finite and much smaller than the number of records, making 'category' a memory-efficient choice.)
55 | - **FIPS**: float64 → *category* (FIPS codes are categorical identifiers for geographic locations. Using 'category' is memory efficient and semantically correct as they are not used for mathematical operations.)
56 | - **ZIP**: int64 → *category* (ZIP codes are geographic identifiers. While numeric, they are not used for calculations. Using 'category' is memory-efficient and avoids issues with leading zeros.)
57 |
58 | ## Missing Values Analysis
59 | **Overview**: The dataset exhibits both meaningful and problematic missingness. Fields like DEATHDATE, SUFFIX, and MAIDEN have high percentages of missing values that are expected and informative, indicating a specific status (e.g., 'alive' or 'not applicable'). Conversely, fields like MARITAL and FIPS have missing values that represent genuine data quality gaps, hindering demographic and geographic analysis.
60 |
61 | ### Problematic Missing Values
62 | - **PREFIX**: 10 missing (16.7%) - Prefixes (Mr., Ms., etc.) are often optional fields. While their absence is common, it represents incomplete data rather than a specific status, making it a minor data quality issue.
63 | - **MARITAL**: 20 missing (33.3%) - Marital status is a core demographic attribute. A 33.3% missing rate is a significant data quality problem, as the absence does not imply a default status (like 'single') and creates gaps in analysis.
64 | - **FIPS**: 14 missing (23.3%) - FIPS is a standardized geographic code for a county. Since COUNTY data exists, the FIPS code should be derivable. Its absence is a data processing or quality issue that hinders standardized geographic analysis.
65 |
66 | ### Likely Meaningful Missing Values
67 | - **DEATHDATE**: 50 missing (83.3%) - The high percentage of missing values (83.3%) strongly suggests that a blank DEATHDATE indicates the person is still alive. The absence of data is the data.
68 | - **DRIVERS**: 6 missing (10.0%) - A missing driver's license number likely means the person does not have one, which could be due to age (minors) or personal choice. It is not necessarily an error.
69 | - **PASSPORT**: 13 missing (21.7%) - Similar to a driver's license, not every individual has a passport. A missing value indicates the person likely does not possess one.
70 | - **SUFFIX**: 56 missing (93.3%) - Name suffixes (Jr., III, etc.) are rare. The very high percentage of missing values (93.3%) correctly reflects that most people do not have one.
71 | - **MAIDEN**: 49 missing (81.7%) - A maiden name is only applicable to a subset of the population (typically, married individuals who changed their name). A blank value is expected for males, unmarried individuals, or those who kept their original name.
72 |
73 | ## Uniqueness Analysis
74 | ### Candidate Key Columns
75 | - **Id**: This column is a system-generated unique identifier (like a UUID) for each record. The table context states it identifies each person, and the data analysis confirms it is 100% unique. It's designed specifically to be a primary key.
76 | - **SSN**: A Social Security Number is a government-issued number intended to be a unique identifier for each person in the United States. It is 100% unique in the sample data and is a strong candidate for a natural key, despite its sensitive nature.
77 |
78 | ### Highly Unique Columns
79 | - **BIRTHDATE**: 83.3% unique
80 | - **DRIVERS**: 90.0% unique
81 | - **PASSPORT**: 78.3% unique
82 | - **FIRST**: 98.3% unique
83 | - **LAST**: 88.3% unique
84 | - **BIRTHPLACE**: 80.0% unique
85 | - **ADDRESS**: 100.0% unique
86 | - **CITY**: 66.7% unique
87 | - **ZIP**: 60.0% unique
88 | - **LAT**: 100.0% unique
89 | - **LON**: 100.0% unique
90 | - **HEALTHCARE_EXPENSES**: 100.0% unique
91 | - **HEALTHCARE_COVERAGE**: 98.3% unique
92 | - **INCOME**: 83.3% unique
93 |
94 | ## Unusual Values Detection
95 | - **SSN**: All sample values begin with the area number '999'. The Social Security Administration (SSA) does not issue SSNs with area numbers (the first three digits) in the 900-999 range. These values are invalid and likely represent dummy or placeholder data.
96 | - **FIRST**: The column 'FIRST' is expected to contain first names. However, all sample values are a mix of text and numbers (e.g., 'Mel236', 'Cheyenne169'). This suggests that names have been concatenated with a numeric ID or code, which is unusual for a standard first name field.
97 | - **LAST**: The values in the 'LAST' column consistently follow a pattern of a name followed by a three-digit number (e.g., 'Bailey598'). This is unusual because a column named 'LAST' is expected to contain only the last name. The presence of appended numbers suggests a potential data quality issue where a name and a numeric ID have been merged into a single field.
98 | - **MAIDEN**: The values in the 'MAIDEN' column consistently follow a pattern of a name followed by a sequence of numbers (e.g., 'Lowe577'). A column representing a maiden name would typically contain only alphabetic characters. The presence of appended numbers is unusual and suggests the column may be a concatenation of a name and a numeric identifier.
99 | - **FIPS**: The values appear to be valid 5-digit county FIPS codes. However, they are stored as floats (float64) instead of strings. FIPS codes are identifiers, not numerical quantities, and should be stored as strings to prevent issues like the loss of leading zeros (e.g., '01001' becoming 1001.0) and to reflect their categorical nature. The trailing '.0' in each sample is an artifact of this incorrect data type.
100 | - **ZIP**: The value '0' is present, which is not a valid ZIP or postal code. This value likely represents missing data, a default entry, or an error during data conversion.
101 | - **HEALTHCARE_COVERAGE**: The presence of `0.0` is unusual. It's ambiguous whether this represents a valid state (no coverage) or is a placeholder for missing data. Additionally, the data has a very wide range, and the value `1777031.06` is a potential high-end outlier, being significantly larger than the other sample values.
102 |
--------------------------------------------------------------------------------
/test/patients.csv:
--------------------------------------------------------------------------------
1 | Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
2 | eb247227-e839-88d3-447d-b5972468f33b,2021-09-23,,999-41-1756,,,,Mel236,Bailey598,,,,white,nonhispanic,M,Norton Center Massachusetts US,716 Wunsch Gardens Unit 48,Framingham,Massachusetts,Middlesex County,25017,01701,42.27565048847629,-71.4763670033942,2520.80,4323.64,170754
3 | 2ffa361e-5858-877e-e022-ce81fe32da1b,1944-05-31,,999-33-4589,S99957814,X45639058X,Mrs.,Cheyenne169,Marks830,,Lowe577,M,white,nonhispanic,F,Longmeadow Massachusetts US,123 Bayer Camp,Taunton,Massachusetts,Bristol County,25005,02718,41.89288420730215,-71.06668598167076,205342.20,94647.00,40526
4 | 3dfb065a-67df-5b8a-3901-49bfd834bed1,2009-02-08,,999-59-2568,,,,Hunter736,Keebler762,,,,white,nonhispanic,M,Maynard Massachusetts US,575 Jast Rue Unit 48,Winchendon,Massachusetts,Worcester County,25027,01475,42.670059014687666,-72.07466425723803,16381.92,17447.87,79884
5 | db80575b-5e9b-921b-fad9-1e3a20929dc7,1979-06-26,1995-07-04,999-77-7700,S99968506,,,Herschel574,Ernser583,,,,asian,nonhispanic,M,Somerville Massachusetts US,184 Langworth Parade Apt 10,Boston,Massachusetts,Suffolk County,25025,02131,42.39551626795498,-71.05901494925675,3850.00,44057.32,6420
6 | d84815a3-c5b3-8ca2-025f-6323a4ec59ef,1973-05-31,,999-29-2359,S99967405,X86891718X,Mrs.,Lacey714,Heathcote539,,Hegmann834,M,white,nonhispanic,F,Natick Massachusetts US,801 Morissette Divide,Hingham,Massachusetts,Plymouth County,25023,02043,42.20072325055452,-70.83659045847199,66662.10,1777031.06,933420
7 | 7ec76836-c039-d9bf-8bb9-fe488c66d452,2003-01-13,,999-42-9847,S99998925,,Ms.,Adelia946,Collier206,,,,white,nonhispanic,F,Marshfield Massachusetts US,459 Larson Union,Boston,Massachusetts,Suffolk County,25025,02134,42.31550631209828,-71.05169551644717,4050.00,158604.59,1361
8 | 79297a39-2d2d-d88d-5e47-7a521af1d69f,1998-12-14,,999-49-9846,S99945605,X69843358X,Mr.,Hayden835,Casper496,,,,white,nonhispanic,M,Charlton Massachusetts US,589 Conroy Approach,Belmont,Massachusetts,Middlesex County,25017,02472,42.36153882121102,-71.20913616208074,40347.11,180076.32,51861
9 | 734e5f3c-e660-6cbe-7c26-c5264cbde68e,2005-03-03,,999-71-8314,S99991875,,,Herb645,Willms744,,,,white,hispanic,M,Melrose Massachusetts US,980 Koss Plaza Apt 11,Brockton,Massachusetts,Plymouth County,25023,02302,42.11165412918098,-71.0259065985567,390568.25,0.00,35002
10 | 750cdaf4-c264-e967-e76b-53a5a61abcab,1983-02-18,,999-95-3792,S99957390,X63804957X,Mr.,Stewart672,Schimmel440,,,M,white,nonhispanic,M,Fall River Massachusetts US,843 Yost Spur Unit 81,Sharon,Massachusetts,Norfolk County,25021,02067,42.10147261542774,-71.2054748347118,7321.10,176776.70,10335
11 | 285cba54-c91d-6db4-4d78-1ea35ba6b622,1998-10-30,,999-44-2795,S99942670,X49037240X,Ms.,Jenae263,Becker968,,,,white,nonhispanic,F,Somerville Massachusetts US,248 Ernser Terrace Suite 86,Lynn,Massachusetts,Essex County,25009,01901,42.49850442782566,-71.03582388708702,34690.56,626729.36,56421
12 | 064ef124-22ef-af09-1940-0fec6c3574bc,1972-05-01,,999-29-7349,S99945886,X86223344X,Ms.,Andera917,Lemke654,,,S,white,nonhispanic,F,Kingston Massachusetts US,606 Price View Unit 89,Boston,Massachusetts,Suffolk County,25025,02116,42.428281838517776,-71.03071500996144,591573.83,346350.22,28232
13 | df6bcea7-a0c7-6ed0-e9e4-fd1dc33b76f7,1965-08-03,,999-38-7473,S99953338,X64982272X,Mrs.,Linn541,Gislason620,,Hermann103,M,white,nonhispanic,F,Westwood Massachusetts US,275 Tromp Burg Suite 54,Erving,Massachusetts,Franklin County,,00000,42.6366197794778,-72.38328626727915,54801.69,228252.18,83088
14 | cfa94700-7440-d5f7-516a-bae08cb365a7,2022-08-14,,999-20-5403,,,,Kaycee352,Koss676,,,,white,nonhispanic,F,New Bedford Massachusetts US,859 Hansen Mission Apt 56,Montague,Massachusetts,Franklin County,,00000,42.590026050563246,-72.52614825581455,1644.00,1310.64,147152
15 | 52f8df2b-25a8-fbba-af75-0e11f3a054d4,2000-10-21,,999-16-4297,S99952319,X27281996X,Ms.,Cindy893,Lueilwitz711,,,,asian,nonhispanic,F,Hanoi Hà Đông VN,892 Haag Gateway Unit 67,Boston,Massachusetts,Suffolk County,25025,02120,42.31347958243134,-71.1029639299087,8065.00,917055.49,3758
16 | bc1efffb-0983-081f-d4c4-3345f6f2abbd,2009-05-16,,999-43-4282,,,,Huey641,Schumm995,,,,black,nonhispanic,M,Needham Massachusetts US,835 Powlowski Junction Suite 1,Danvers,Massachusetts,Essex County,25009,01923,42.60647530229309,-71.02623584083403,13076.83,5093.56,74906
17 | b3b71304-fe5b-bda4-6822-bd901b2836d1,1962-05-14,,999-48-5926,S99948203,X32413718X,Mr.,Antony83,Armstrong51,,,M,white,nonhispanic,M,Fall River Massachusetts US,830 Dare Park Apt 34,Marshfield,Massachusetts,Plymouth County,25023,02050,42.162629807392534,-70.73981579334742,41148.04,632431.62,35673
18 | fd0b726d-b7e6-976d-7cda-8679dd849610,1965-01-01,,999-96-6743,S99949277,X14857165X,Ms.,Daniela614,Rico947,,,S,white,hispanic,F,Bayamon Puerto Rico PR,279 Grady Estate,Boston,Massachusetts,Suffolk County,25025,02120,42.32851940354027,-71.03026624879821,54097.09,478630.93,93689
19 | 53534989-404e-cc7c-2859-1708edba296c,1959-05-28,,999-14-9672,S99998623,X86168167X,Mr.,Fletcher87,O'Conner199,,,S,white,nonhispanic,M,Wellesley Massachusetts US,873 Ledner Hollow Unit 28,Gardner,Massachusetts,Worcester County,25027,01440,42.56780989156895,-72.00057697137188,329359.12,160957.56,40192
20 | fca2a21e-3319-131a-7e84-ff984b871e16,1979-06-26,,999-29-4844,S99938158,X13579933X,Mr.,Kirk871,Nolan344,,,S,asian,nonhispanic,M,Billerica Massachusetts US,356 Wintheiser Passage,Boston,Massachusetts,Suffolk County,25025,02109,42.382825730736656,-71.060338397059,12300.34,775237.82,6420
21 | d89557ad-d741-8ea5-b542-c1226a781d83,1963-12-09,,999-22-2635,S99975865,X35559645X,Mr.,Steve819,Brakus656,,,M,white,nonhispanic,M,Quincy Massachusetts US,949 Langworth Light Apt 7,Yarmouth,Massachusetts,Barnstable County,,00000,41.67994080784203,-70.22659782831724,517636.61,2883.47,51527
22 | c8fbb10b-b54e-8182-d71c-c552bd1c58b1,1976-04-03,,999-50-5697,S99983337,X69114774X,Mr.,Hayden835,Schumm995,,,S,white,nonhispanic,M,Braintree Massachusetts US,856 Gusikowski Lane,North Adams,Massachusetts,Berkshire County,25003,01247,42.63685002791058,-73.08831197031574,190214.57,27591.19,36603
23 | 1d13ebc3-0635-059a-5fe9-82c92ede84ec,2006-05-08,,999-79-3695,S99923330,,,Moises22,O'Conner199,,,,white,hispanic,M,Saugus Massachusetts US,459 Cassin Forge Suite 9,Baldwinville,Massachusetts,Worcester County,25027,01436,42.603196202455976,-72.08703841216905,20035.70,11871.97,69174
24 | 0cd0df97-9d92-5d95-fbde-6b0a7e6af1c8,1932-07-09,1941-07-07,999-55-6098,,,,Coleman27,Kreiger457,,,,white,nonhispanic,M,Boston Massachusetts US,785 Ankunding Drive,Scituate,Massachusetts,Plymouth County,25023,02066,42.18599323192734,-70.79923081726402,9206.41,38661.38,870606
25 | fa451eba-6815-0d99-fd91-02f5581d914b,1946-11-12,,999-79-2426,S99940159,X53572828X,Ms.,Lillia547,Nolan344,,,S,white,nonhispanic,F,Chelsea Massachusetts US,287 Medhurst Bypass,Saugus,Massachusetts,Essex County,25009,01906,42.42536923970955,-71.00203089714843,64688.84,1193878.90,99091
26 | 92bc26b1-c317-7db6-492e-3c8ea452b36d,2005-04-22,,999-57-7157,S99990370,,,Vanesa40,Anderson154,,,,white,nonhispanic,F,Lawrence Massachusetts US,990 Hyatt Gateway,Chicopee,Massachusetts,Hampden County,25013,01013,42.20443967658899,-72.59684936418238,23076.68,14653.34,27706
27 | 8cfec0eb-f022-f332-55f5-38a2c35f5b84,2002-08-14,,999-93-7263,S99928764,X60540972X,Ms.,Mirta419,Hayes766,,,,white,nonhispanic,F,Boston Massachusetts US,737 Hauck Estate,Holliston,Massachusetts,Middlesex County,,00000,42.19537880557404,-71.40405413382206,11458.37,763952.25,17389
28 | 27948426-0f88-0e3b-dd6b-8bd9d8512892,1991-04-23,,999-99-2106,S99915210,X21909602X,Ms.,Shiela18,Jenkins714,,,S,white,nonhispanic,F,Methuen Massachusetts US,931 Lowe Route,Boston,Massachusetts,Suffolk County,25025,02111,42.366094173491376,-71.0766434184029,37489.68,507808.67,172784
29 | 548300bb-3152-531c-d895-d44fbf2ff1ba,1972-10-14,,999-17-2611,S99989489,X58425716X,Mrs.,Christal240,Hoppe518,,Gorczany269,M,white,nonhispanic,F,Boston Massachusetts US,298 Ryan Corner Suite 66,Sandwich,Massachusetts,Barnstable County,25001,02563,41.710640404859575,-70.46063787198887,53359.37,719097.64,91140
30 | 821960e1-9db8-7b56-a359-ac34d9228fbc,1960-04-26,,999-61-6611,S99976250,X16809119X,Mrs.,Guadalupe206,Bermúdez789,,Barela183,M,white,hispanic,F,Ponce Puerto Rico PR,336 Nienow Course,Tyngsborough,Massachusetts,Middlesex County,,00000,42.66681617186233,-71.47460669911267,67147.48,794967.24,64210
31 | 100881d9-7b59-8060-2772-f41b276970fc,1985-01-06,,999-89-9127,S99998550,X39158329X,Mrs.,Corey514,Johnson679,,Beier427,M,white,nonhispanic,F,Norton Massachusetts US,412 Spinka Plaza,Quincy,Massachusetts,Norfolk County,25021,02171,42.2627318928807,-71.01242779795183,41414.26,543793.88,129073
32 | d4db8ae1-3354-d064-508d-834cfa214cb2,1958-07-12,,999-35-8448,S99991458,X15138167X,Mr.,Gayle448,MacGyver246,,,M,white,nonhispanic,M,Framingham Massachusetts US,138 Hilll Well,Yarmouth,Massachusetts,Barnstable County,,00000,41.679741071089055,-70.17797861520093,79882.99,32342.26,21813
33 | d7f0610d-ec4d-fb9e-2d0d-b54e3ad621fb,1978-12-22,,999-88-1792,S99963549,X35369763X,Mr.,Whitney250,Hamill307,,,M,white,hispanic,M,Walpole Massachusetts US,416 Oberbrunner Dam Apt 95,Worcester,Massachusetts,Worcester County,25027,01603,42.29434089325724,-71.80056312559488,36814.41,239.57,44580
34 | e8323f7c-6829-3bce-0621-63d588b2e901,2010-09-21,,999-32-4862,,,,Dione665,Wilkinson796,,,,white,nonhispanic,F,Grafton Massachusetts US,455 Rutherford Lock,Lawrence,Massachusetts,Essex County,25009,01841,42.65392733597755,-71.1397130605323,13474.46,27265.47,64612
35 | 2995bf9b-5760-2099-77fe-ba01250cec42,1953-05-07,,999-10-1178,S99915523,X21910865X,Ms.,Shayla126,Rath779,,,S,white,nonhispanic,F,Fitchburg Massachusetts US,106 Hane Skyway Suite 0,Hampden,Massachusetts,Hampden County,,00000,42.07986301784488,-72.43670689724918,16242.76,800689.76,15098
36 | b4ea2bfe-cd6b-92a6-ff78-d2e995243894,1932-07-09,1966-11-11,999-70-2405,S99928700,X87718021X,Mr.,Damon455,Kshlerin58,JD,,S,white,nonhispanic,M,Cambridge Massachusetts US,803 Powlowski Park,Scituate,Massachusetts,Plymouth County,25023,02066,42.16374680818888,-70.80708512636961,23445.48,18839.51,870606
37 | ee98453d-79ed-910b-2e4a-9b32d9350fb6,1991-03-13,,999-70-4594,S99922993,X63601019X,Mr.,Luke971,Trantow673,,,M,asian,nonhispanic,M,Haiphong Kiến An VN,653 Jones Run Suite 14,Leominster,Massachusetts,Worcester County,25027,01453,42.55158257952831,-71.7700198666129,24117.54,26060.37,139930
38 | 98308074-8188-7b69-a1d1-be735cdc3ff4,1997-08-01,,999-14-9380,S99982480,X30896463X,Ms.,Carrol931,Rutherford999,,,,white,nonhispanic,F,Malden Massachusetts US,401 Reichel Route Suite 47,Pittsfield,Massachusetts,Berkshire County,25003,01201,42.414512991574455,-73.30118644516655,17732.85,963178.27,143644
39 | 2333e462-582c-9c83-d382-4c5e0c2c1ad0,2000-09-29,,999-58-7543,S99923245,X14074343X,Ms.,Lavette209,Zboncak558,,,,black,nonhispanic,F,Medfield Massachusetts US,994 Feest Crossroad Apt 13,Marblehead,Massachusetts,Essex County,25009,01945,42.49472255145156,-70.81702859735168,31189.37,595880.21,129433
40 | 931c7fd6-6330-1008-cef4-df84dd836d15,2002-10-04,,999-26-4422,S99938533,X25071442X,Mr.,Burton124,Stehr398,,,,white,nonhispanic,M,Taunton Massachusetts US,619 Upton Landing Apt 9,Chelmsford,Massachusetts,Middlesex County,,00000,42.548797580393405,-71.3540262724428,25994.44,467794.53,185360
41 | b3f13b30-5802-e5f3-685b-36c3c09283f1,2003-03-10,,999-20-4271,S99992852,,Ms.,Lessie363,Langworth352,,,,white,nonhispanic,F,Lynnfield Massachusetts US,745 Koelpin Trailer,Westborough,Massachusetts,Worcester County,25027,01581,42.30108671835689,-71.57704229893056,5139.91,88026.45,9657
42 | 4860e9a0-1263-6ca4-fe42-b7a73cbeec16,1947-12-14,2011-08-10,999-37-8682,S99927544,X81110019X,Mr.,Dominick530,Mills423,,,M,white,nonhispanic,M,Lynn Massachusetts US,1025 Spinka Overpass Suite 19,Wilbraham,Massachusetts,Hampden County,25013,01095,42.11528427077081,-72.45768605820261,58407.08,40127.28,326248
43 | 87424a5e-7848-aed5-fd59-4c8a76c2ed36,1965-08-08,,999-36-1150,S99945201,X58343864X,Mrs.,Zoila41,McGlynn426,,DuBuque211,M,white,nonhispanic,F,Brookline Massachusetts US,867 Langosh Grove Apt 84,Fairhaven,Massachusetts,Bristol County,,00000,41.63250541278397,-70.87313339013353,15950.00,1425035.79,15144
44 | 26a90721-54f3-b755-ecf4-a8aab978c01c,1963-02-27,,999-55-3195,S99981647,X12176676X,Mr.,Martín25,Roldán470,,,M,white,hispanic,M,Buenos Aires Ciudad de Buenos Aires AR,382 Satterfield Annex Suite 45,Ludlow,Massachusetts,Hampden County,,00000,42.219739387326875,-72.45861669907123,829967.44,0.00,49667
45 | 06d7ef99-093b-fe84-6d7c-52a3eab126fe,1955-04-04,,999-99-2436,S99953324,X10127197X,Ms.,Yetta429,Doyle959,,,S,white,nonhispanic,F,Grafton Massachusetts US,762 Senger Lodge,New Bedford,Massachusetts,Bristol County,25005,02740,41.62800607094596,-70.98717778570096,1079260.67,33271.04,49980
46 | 74d4ca38-9f05-2212-c539-44139fdd8ab4,2003-08-20,,999-44-9634,S99977402,,Mr.,Elden718,Collins926,,,,white,nonhispanic,M,Salem Massachusetts US,343 Reynolds Lock Unit 95,Cambridge,Massachusetts,Middlesex County,25017,02141,42.41937500378478,-71.10280714780403,7303.71,44500.70,16133
47 | f56c230b-3a7c-aca2-6363-fa3d46cf6596,1985-02-05,,999-51-3221,S99937081,X44375219X,Mrs.,Shaquana156,MacGyver246,,Deckow585,M,black,nonhispanic,F,Brockton Massachusetts US,282 Wintheiser Quay Suite 46,Sharon,Massachusetts,Norfolk County,25021,02067,42.07048110807395,-71.20906020387682,45950.72,726541.39,397408
48 | 37279a07-035d-e18b-bcd7-331dc3fe6304,1975-08-17,,999-91-9580,S99984964,X59848524X,Mr.,Jimmie93,Graham902,,,M,asian,nonhispanic,M,Beijing Beijing Municipality CN,269 Jones Estate Apt 87,Lynn,Massachusetts,Essex County,25009,01901,42.540185973577316,-70.95005850290717,50890.22,105568.63,58204
49 | 2d5a8517-f25f-9f66-9ab8-0a69425145c7,1960-04-25,,999-62-7937,S99914972,X40059543X,Mr.,Manuel446,Quitzon246,,,M,white,nonhispanic,M,New Bedford Massachusetts US,828 Hahn Ferry Suite 62,Raynham,Massachusetts,Bristol County,,00000,41.93105766594213,-71.08785486712216,54306.03,143291.33,77316
50 | 1666a800-a041-a2ca-4f9b-af668e740370,1963-06-05,,999-30-8851,S99915586,X37464865X,Mr.,Grant908,Hahn503,,,M,white,nonhispanic,M,Boston Massachusetts US,168 Ernser Viaduct Apt 32,Cambridge,Massachusetts,Middlesex County,25017,02141,42.33909824190683,-71.11622581495207,51477.06,68767.89,224975
51 | 8762040e-69bb-6ac8-685c-1d63a3d4dfe2,1960-02-25,,999-10-6028,S99958879,X62869090X,Ms.,Carmelita854,Hagenes547,,,S,white,nonhispanic,F,Taunton Massachusetts US,900 McClure Fort,Salem,Massachusetts,Essex County,25009,01970,42.52904093176596,-70.86167782746726,16750.00,1561478.75,13488
52 | 48334e94-64e7-91e8-d91b-1246110bf1ba,1932-07-09,2004-12-07,999-23-9351,S99916924,X14993625X,Mr.,Leandro563,Hane680,JD,,M,white,nonhispanic,M,Duxbury Massachusetts US,166 Jerde Avenue,Scituate,Massachusetts,Plymouth County,25023,02066,42.25402035749961,-70.7386488899729,62366.99,201314.86,870606
53 | a68f0fcf-424b-d8dd-3949-a69f0f3f9979,1935-09-06,2020-07-09,999-12-9121,S99987192,X75313923X,Mrs.,Janeth814,Feest103,,Shanahan202,M,white,nonhispanic,F,Boxford Massachusetts US,175 Mayer Frontage road Apt 63,East Longmeadow,Massachusetts,Hampden County,,00000,42.04366398532018,-72.53842235808679,909196.90,370234.71,29277
54 | 4f233603-d38e-fec1-7106-b6a09c62f28e,1947-12-14,2012-02-06,999-61-6740,S99975126,X69639917X,Mr.,Raymon366,Orn563,,,M,white,nonhispanic,M,East Bridgewater Massachusetts US,144 Waters Bypass Apt 13,Wilbraham,Massachusetts,Hampden County,25013,01095,42.1070174210795,-72.46946143213026,61483.39,148955.95,326248
55 | 561bc09a-56b9-859e-b926-fc66685d9df1,1946-03-21,,999-73-5643,S99995586,X13967823X,Mr.,Tyrell880,Schimmel440,,,M,white,nonhispanic,M,Georgetown Massachusetts US,102 Waters Estate Unit 93,New Bedford,Massachusetts,Bristol County,25005,02743,41.76092573197442,-70.93429596770599,69632.06,271187.57,62948
56 | 0c603e3d-ff1b-936d-14aa-9e875fa47cad,1932-07-09,2012-05-11,999-85-2178,S99970498,X20385492X,Mr.,Irving123,Hamill307,PhD,,M,white,nonhispanic,M,Kingston Massachusetts US,880 Bauch Lodge,Scituate,Massachusetts,Plymouth County,25023,02066,42.1894755210766,-70.72442915485766,67284.69,105065.45,870606
57 | da069417-667e-3b7e-8730-b00bf5dbcd7f,1935-09-06,2015-02-25,999-50-5586,S99935886,X71918603X,Mrs.,Ethel888,Corwin846,,Predovic534,M,white,nonhispanic,F,New Bedford Massachusetts US,344 Miller Street,East Longmeadow,Massachusetts,Hampden County,,00000,42.06043518424488,-72.46281354520333,724339.28,171776.51,29277
58 | dd19ae3a-2f3a-a636-f2bf-f3fe51e3ff7d,1947-12-14,2011-07-21,999-61-4140,S99946445,X63265370X,Mr.,Cedrick207,Cruickshank494,,,M,white,nonhispanic,M,Stoneham Massachusetts US,1051 Brakus Center Unit 33,Wilbraham,Massachusetts,Hampden County,25013,01095,42.153689936905664,-72.46563770143045,45887.05,1111058.21,326248
59 | 86636875-af39-df1b-edd7-209e8ffb77d2,1932-07-09,,999-26-8041,S99949411,X80854357X,Mr.,Garry927,Nikolaus26,MD,,M,white,nonhispanic,M,Rockland Massachusetts US,609 Paucek Skyway,Scituate,Massachusetts,Plymouth County,25023,02066,42.21737593221302,-70.71331268909627,53754.99,77450.36,870606
60 | 5dbb559c-7f46-f0bb-58ad-9069d1cce9b4,1935-09-06,,999-23-4696,S99949724,X57424902X,Mrs.,Hildred696,Casper496,,Kohler843,M,white,nonhispanic,F,Malden Massachusetts US,638 Bradtke Hollow,East Longmeadow,Massachusetts,Hampden County,,00000,42.04728057003359,-72.49659986178744,983859.74,213860.48,29277
61 | 9cfb6988-97da-cc56-58d4-81be46378f43,1947-12-14,,999-56-9201,S99974442,X17350166X,Mr.,Lonny638,Klein929,,,M,white,nonhispanic,M,Maynard Massachusetts US,848 Kuphal Junction,Wilbraham,Massachusetts,Hampden County,25013,01095,42.13813414111617,-72.45483019554929,71706.16,392055.44,326248
62 |
--------------------------------------------------------------------------------
/nodes.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import yaml
3 | from pocketflow import Node, BatchNode
4 | from utils.call_llm import call_llm
5 |
def truncate_cell(value, max_length=50):
    """Return a display-friendly string form of *value*, capped at *max_length* chars.

    Missing values (None/NaN) are passed through untouched so pandas keeps
    treating them as missing; everything else is stringified and, when too
    long, truncated with a trailing ellipsis.
    """
    if pd.isna(value):
        return value
    text = str(value)
    return text if len(text) <= max_length else text[:max_length] + "..."
14 |
15 |
class DuplicateDetectionNode(Node):
    """Find fully-duplicated rows and let the LLM judge whether to drop them.

    Writes its findings to shared["profile_results"]["duplicates"].
    """

    def prep(self, shared):
        df = shared["dataframe"]

        # keep=False marks every copy of a duplicated row, not just the extras.
        duplicate_rows = df[df.duplicated(keep=False)]
        # Number of redundant rows, i.e. how many could be dropped.
        duplicate_count = len(duplicate_rows) - len(duplicate_rows.drop_duplicates())
        duplicate_percentage = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0

        # Get a sample of duplicate rows for LLM analysis.
        sample_duplicates = ""
        if duplicate_count > 0:
            # DataFrame.applymap is deprecated since pandas 2.1; mapping each
            # Series keeps the same element-wise behavior on all pandas 2.x.
            sample_df = duplicate_rows.head(10).apply(lambda s: s.map(truncate_cell))
            sample_duplicates = sample_df.to_csv(index=False, quoting=1)

        # Small table sample for prompt context (quoting=1 -> csv.QUOTE_ALL).
        table_sample = df.head(5).apply(lambda s: s.map(truncate_cell)).to_csv(index=False, quoting=1)

        return {
            "duplicate_count": duplicate_count,
            "duplicate_percentage": duplicate_percentage,
            "total_rows": len(df),
            "sample_duplicates": sample_duplicates,
            "table_sample": table_sample
        }

    def exec(self, prep_res):
        # Skip the LLM round-trip entirely when the data has no duplicates.
        if prep_res["duplicate_count"] == 0:
            return {
                "should_remove": False,
                "analysis": "No duplicate rows found in the dataset."
            }

        prompt = f"""
You have a table with {prep_res["total_rows"]} total rows and {prep_res["duplicate_count"]} duplicate rows ({prep_res["duplicate_percentage"]:.2f}%).

Sample of the table:
{prep_res["table_sample"]}

Sample duplicate rows:
{prep_res["sample_duplicates"]}

Analyze these duplicates and decide whether they should be removed.

Return in YAML format:
```yaml
should_remove: true/false
analysis: "Brief analysis explaining why duplicates should/shouldn't be removed"
```
"""

        response = call_llm(prompt)
        # Extract the fenced YAML payload; a malformed response fails the
        # asserts below, which triggers the Node's retry mechanism.
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        assert "should_remove" in result
        assert "analysis" in result
        assert isinstance(result["should_remove"], bool)
        assert isinstance(result["analysis"], str)

        return result

    def post(self, shared, prep_res, exec_res):
        shared["profile_results"]["duplicates"] = {
            "count": prep_res["duplicate_count"],
            "percentage": prep_res["duplicate_percentage"],
            "total_rows": prep_res["total_rows"],
            "should_remove": exec_res["should_remove"],
            "analysis": exec_res["analysis"],
            "sample_rows": prep_res["sample_duplicates"]
        }
        # Explicit action string, consistent with the other nodes in this file.
        return "default"
87 |
class TableSummaryNode(Node):
    """Ask the LLM for a short natural-language summary of the whole table."""

    def prep(self, shared):
        frame = shared["dataframe"]
        # A truncated 50-row preview gives the LLM enough context without
        # blowing up the prompt size.
        preview = frame.head(50).applymap(truncate_cell).to_csv(index=False, quoting=1)
        return {
            "sample_data": preview,
            "column_names": list(frame.columns),
            "row_count": len(frame),
        }

    def exec(self, prep_res):
        columns_str = ", ".join(prep_res["column_names"])

        prompt = f"""
You have a table with {prep_res["row_count"]} rows and the following columns: {columns_str}

Sample data:
{prep_res["sample_data"]}

Task: Summarize what this table represents.
- Highlight: Include and highlight ALL column names as **Column_Name**
- Structure: Start with the big picture, then explain how columns are related
- Requirement: ALL column names must be mentioned and **highlighted**. Use exact column names (case sensitive)
- Style: Use a few short sentences with simple words

Example: "The table contains information about ... with **Customer_ID**, **Order_Date**, and **Amount**..."

Your summary:
"""

        # Free-form text response; no structured parsing needed here.
        return call_llm(prompt)

    def post(self, shared, prep_res, exec_res):
        shared["profile_results"]["table_summary"] = exec_res
        return "default"
131 |
class ColumnDescriptionNode(BatchNode):
    """Describe every column (and suggest better names) in LLM-sized chunks."""

    # Number of columns sent to the LLM per request; keeps each prompt small.
    # Previously hard-coded inline; subclasses can now tune it.
    CHUNK_SIZE = 10

    def prep(self, shared):
        df = shared["dataframe"]
        columns = list(df.columns)

        # Split the columns into chunks so very wide tables stay within
        # reasonable prompt sizes.
        chunks = []
        for i in range(0, len(columns), self.CHUNK_SIZE):
            chunk_columns = columns[i:i + self.CHUNK_SIZE]
            # DataFrame.applymap is deprecated since pandas 2.1; mapping each
            # Series keeps the same element-wise behavior on all pandas 2.x.
            chunk_df = df[chunk_columns].head(5).apply(lambda s: s.map(truncate_cell))
            chunk_sample = chunk_df.to_csv(index=False, quoting=1)
            chunks.append((chunk_columns, chunk_sample))

        return chunks

    def exec(self, chunk_data):
        chunk_columns, chunk_sample = chunk_data

        prompt = f"""
You have the following table columns and sample data:
{chunk_sample}

For each column, provide a short description and suggest a better name if needed.

Return in YAML format:
```yaml
{chunk_columns[0]}:
  description: "Short description"
  suggested_name: "new_column_name"
...
```
"""

        response = call_llm(prompt)
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        # Validate all columns are present with required fields; a failed
        # assert triggers the Node's retry mechanism.
        for col in chunk_columns:
            assert col in result, f"Column {col} missing from result"
            assert "description" in result[col], f"Description missing for {col}"
            assert "suggested_name" in result[col], f"Suggested name missing for {col}"
            assert isinstance(result[col]["description"], str)
            assert isinstance(result[col]["suggested_name"], str)

        return result

    def post(self, shared, prep_res, exec_res_list):
        # Merge the per-chunk dicts into a single column -> info mapping.
        all_descriptions = {}
        for chunk_result in exec_res_list:
            all_descriptions.update(chunk_result)

        shared["profile_results"]["column_descriptions"] = all_descriptions
        return "default"
188 |
class DataTypeAnalysisNode(Node):
    """Compare current pandas dtypes with LLM-suggested ones per column."""

    def prep(self, shared):
        df = shared["dataframe"]

        # Current dtype of every column, as strings for the prompt/report.
        current_types = {col: str(df[col].dtype) for col in df.columns}

        # DataFrame.applymap is deprecated since pandas 2.1; mapping each
        # Series keeps the same element-wise behavior on all pandas 2.x.
        sample_df = df.head(10).apply(lambda s: s.map(truncate_cell))
        sample_data = sample_df.to_csv(index=False, quoting=1)

        return {
            "sample_data": sample_data,
            "current_types": current_types,
            "columns": list(df.columns)
        }

    def exec(self, prep_res):
        types_info = "\n".join([f"{col}: currently {dtype}" for col, dtype in prep_res["current_types"].items()])
        # Whitelist of dtypes the LLM may suggest; also used for validation.
        valid_types = ["int64", "float64", "object", "datetime64", "bool", "category"]

        prompt = f"""
You have the following table with current data types:
{types_info}

Sample data:
{prep_res["sample_data"]}

For each column, suggest the most appropriate data type from: {valid_types}

Return in YAML format:
```yaml
column1:
  suggested_type: "int64"
  reason: "Contains only integer values"
...
```
"""

        response = call_llm(prompt)
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        # Validate all columns are present with required fields; a failed
        # assert triggers the Node's retry mechanism.
        for col in prep_res["columns"]:
            assert col in result, f"Column {col} missing from result"
            assert "suggested_type" in result[col], f"Suggested type missing for {col}"
            assert "reason" in result[col], f"Reason missing for {col}"
            assert result[col]["suggested_type"] in valid_types, f"Invalid type for {col}: {result[col]['suggested_type']}"
            assert isinstance(result[col]["reason"], str)

        return result

    def post(self, shared, prep_res, exec_res):
        # Combine current and suggested types into one record per column.
        data_types = {}
        for col in prep_res["columns"]:
            data_types[col] = {
                "current_type": prep_res["current_types"][col],
                "suggested_type": exec_res[col]["suggested_type"],
                "reason": exec_res[col]["reason"]
            }

        shared["profile_results"]["data_types"] = data_types
        return "default"
254 |
class MissingValuesAnalysisNode(Node):
    """Quantify missing values per column and ask the LLM whether they matter."""

    def prep(self, shared):
        df = shared["dataframe"]

        # Count/percentage of NaNs for every column that has at least one.
        missing_info = {}
        for col in df.columns:
            # int() strips the numpy.int64 wrapper so plain Python numbers end
            # up in shared results (safer for later serialization/formatting).
            missing_count = int(df[col].isna().sum())
            if missing_count > 0:
                missing_percentage = (missing_count / len(df)) * 100
                missing_info[col] = {
                    "count": missing_count,
                    "percentage": missing_percentage
                }

        # DataFrame.applymap is deprecated since pandas 2.1; mapping each
        # Series keeps the same element-wise behavior on all pandas 2.x.
        sample_df = df.head(10).apply(lambda s: s.map(truncate_cell))
        sample_data = sample_df.to_csv(index=False, quoting=1)

        return {
            "missing_info": missing_info,
            "sample_data": sample_data,
            "total_rows": len(df)
        }

    def exec(self, prep_res):
        # Skip the LLM round-trip when there is nothing to analyze.
        # BUG FIX: this early return previously used the keys "reasoning" /
        # "columns_analysis", but post() unconditionally reads
        # exec_res["overall_analysis"] and exec_res["columns"], so a clean
        # dataset crashed with KeyError. Use the same schema as the LLM path.
        if not prep_res["missing_info"]:
            return {
                "overall_analysis": "No missing values found in any columns.",
                "columns": {}
            }

        missing_desc = "\n".join([
            f"{col}: {info['count']} missing ({info['percentage']:.1f}%)"
            for col, info in prep_res["missing_info"].items()
        ])

        prompt = f"""
You have a table with the following missing values:
{missing_desc}

Sample data for context:
{prep_res["sample_data"]}

For each column with missing values, determine if missing values are meaningful or problematic.

Return in YAML format:
```yaml
overall_analysis: "Brief overall analysis"
columns:
  column_name:
    is_meaningful: true/false
    reason: "Brief explanation"
  ...
```
"""

        response = call_llm(prompt)
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        # Validate structure; a failed assert triggers the Node's retry.
        assert "overall_analysis" in result
        assert "columns" in result
        assert isinstance(result["overall_analysis"], str)
        assert isinstance(result["columns"], dict)

        # Validate each column analysis
        for col in prep_res["missing_info"].keys():
            assert col in result["columns"], f"Missing analysis for column {col}"
            assert "is_meaningful" in result["columns"][col]
            assert "reason" in result["columns"][col]
            assert isinstance(result["columns"][col]["is_meaningful"], bool)
            assert isinstance(result["columns"][col]["reason"], str)

        return result

    def post(self, shared, prep_res, exec_res):
        missing_values = {}

        # Columns that actually have missing values get the LLM's verdict.
        for col, info in prep_res["missing_info"].items():
            analysis = exec_res["columns"][col]
            missing_values[col] = {
                "count": info["count"],
                "percentage": info["percentage"],
                "is_meaningful": analysis["is_meaningful"],
                "reason": analysis["reason"]
            }

        # Fill in a default record for fully-populated columns.
        df = shared["dataframe"]
        for col in df.columns:
            if col not in missing_values:
                missing_values[col] = {
                    "count": 0,
                    "percentage": 0.0,
                    "is_meaningful": True,
                    "reason": "No missing values"
                }

        shared["profile_results"]["missing_values"] = missing_values
        shared["profile_results"]["missing_analysis"] = exec_res["overall_analysis"]
        return "default"
359 |
class UniquenessAnalysisNode(Node):
    """Measure per-column uniqueness and have the LLM flag candidate keys."""

    def prep(self, shared):
        df = shared["dataframe"]

        # Distinct-value counts and ratios for every column.
        uniqueness_info = {}
        for col in df.columns:
            unique_count = df[col].nunique()
            total_count = len(df)
            unique_percentage = (unique_count / total_count) * 100 if total_count > 0 else 0

            uniqueness_info[col] = {
                "unique_count": unique_count,
                "total_count": total_count,
                "unique_percentage": unique_percentage
            }

        # Sample data plus the table summary (if already computed) for context.
        # DataFrame.applymap is deprecated since pandas 2.1; mapping each
        # Series keeps the same element-wise behavior on all pandas 2.x.
        sample_df = df.head(10).apply(lambda s: s.map(truncate_cell))
        sample_data = sample_df.to_csv(index=False, quoting=1)
        table_summary = shared["profile_results"].get("table_summary", "")

        # Only highly unique columns (>90%) are plausible candidate keys.
        highly_unique = {col: info for col, info in uniqueness_info.items()
                        if info["unique_percentage"] > 90}

        return {
            "uniqueness_info": uniqueness_info,
            "highly_unique": highly_unique,
            "sample_data": sample_data,
            "table_summary": table_summary
        }

    def exec(self, prep_res):
        # No LLM call needed when nothing is unique enough to be a key.
        if not prep_res["highly_unique"]:
            return {
                "reasoning": "No columns found that could serve as candidate keys.",
                "candidate_keys": {}
            }

        highly_unique_desc = "\n".join([
            f"{col}: {info['unique_count']}/{info['total_count']} unique ({info['unique_percentage']:.1f}%)"
            for col, info in prep_res["highly_unique"].items()
        ])

        prompt = f"""
Table context: {prep_res["table_summary"]}

Sample data:
{prep_res["sample_data"]}

The following columns have high uniqueness:
{highly_unique_desc}

Analyze which columns could serve as candidate keys (unique identifiers) for this table.
Consider:
- What each row represents in this table
- Whether the column values should be unique across all rows
- Avoid continuous numerical values (like temperatures, prices) that happen to be unique in the sample

Return in YAML format:
```yaml
reasoning: "Analysis of which columns can serve as identifiers..."
candidate_keys:
  column_name:
    is_candidate_key: true/false
    explanation: "Why this column is/isn't a good candidate key"
  ...
```
"""

        response = call_llm(prompt)
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        # Minimal validation (siblings validate too): a non-mapping response
        # (e.g. a YAML list) would crash post(); the assert makes the Node
        # retry instead. Inner fields stay optional — post() uses .get().
        assert isinstance(result, dict), "LLM response must be a YAML mapping"
        return result

    def post(self, shared, prep_res, exec_res):
        uniqueness = {}

        # Merge raw counts with the LLM's (possibly partial) key analysis.
        for col, info in prep_res["uniqueness_info"].items():
            candidate_analysis = exec_res.get("candidate_keys", {}).get(col, {})
            uniqueness[col] = {
                "unique_count": info["unique_count"],
                "unique_percentage": info["unique_percentage"],
                "is_candidate_key": candidate_analysis.get("is_candidate_key", False),
                "explanation": candidate_analysis.get("explanation", "")
            }

        shared["profile_results"]["uniqueness"] = uniqueness
        shared["profile_results"]["uniqueness_reasoning"] = exec_res.get("reasoning", "")
        return "default"
450 |
class UnusualValuesDetectionNode(BatchNode):
    """Scan each column's distinct values and ask the LLM for anomalies."""

    # Max distinct values shown to the LLM per column. The original code
    # collected and truncated up to 1000 distinct values in prep() but only
    # ever put the first 15 into the prompt — the extra work was discarded.
    MAX_PROMPT_VALUES = 15

    def prep(self, shared):
        df = shared["dataframe"]

        # One analysis task per column.
        column_tasks = []
        for col in df.columns:
            # Collect only as many distinct non-null values as the prompt uses.
            sample_values = df[col].dropna().drop_duplicates().head(self.MAX_PROMPT_VALUES)
            sample_list = [truncate_cell(val, 100) for val in sample_values]

            column_tasks.append({
                "column_name": col,
                "sample_values": sample_list,
                "data_type": str(df[col].dtype)
            })

        return column_tasks

    def exec(self, column_task):
        col_name = column_task["column_name"]
        sample_values = column_task["sample_values"]
        data_type = column_task["data_type"]

        # Fully-missing columns have nothing to inspect.
        if not sample_values:
            return {
                "column_name": col_name,
                "has_unusual": False,
                "explanation": "No values to analyze (all missing)"
            }

        # Defensive re-slice in case a caller supplies a longer list.
        values_str = ", ".join([f"'{val}'" for val in sample_values[:self.MAX_PROMPT_VALUES]])

        prompt = f"""
Column "{col_name}" (type: {data_type}) has the following sample values:
{values_str}

Check if there are any unusual values that seem wrong or inconsistent.

Return in YAML format:
```yaml
has_unusual: true/false
explanation: "Brief explanation of findings"
```
"""

        response = call_llm(prompt)
        yaml_str = response.split("```yaml")[1].split("```")[0].strip()
        result = yaml.safe_load(yaml_str)

        # Validate structure; a failed assert triggers the Node's retry.
        assert "has_unusual" in result
        assert "explanation" in result
        assert isinstance(result["has_unusual"], bool)
        assert isinstance(result["explanation"], str)

        result["column_name"] = col_name
        return result

    def post(self, shared, prep_res, exec_res_list):
        # Re-key the per-column results by column name for the report.
        unusual_values = {}

        for result in exec_res_list:
            col_name = result["column_name"]
            unusual_values[col_name] = {
                "has_unusual": result["has_unusual"],
                "explanation": result["explanation"]
            }

        shared["profile_results"]["unusual_values"] = unusual_values
        return "default"
523 |
class GenerateReportNode(Node):
    """Assemble the accumulated profiling results into a markdown report."""

    def prep(self, shared):
        # The report is rendered purely from the collected profiling results.
        return shared["profile_results"]

    def exec(self, profile_results):
        """Render every available profiling section into one markdown string.

        Sections whose keys are absent from `profile_results` are skipped, so
        the report degrades gracefully if upstream nodes did not run.
        """
        sections = ["# Data Profiling Report\n"]
        sections += self._table_summary_lines(profile_results)
        sections += self._duplicates_lines(profile_results)
        sections += self._column_description_lines(profile_results)
        sections += self._data_type_lines(profile_results)
        sections += self._missing_value_lines(profile_results)
        sections += self._uniqueness_lines(profile_results)
        sections += self._unusual_value_lines(profile_results)
        return "\n".join(sections)

    @staticmethod
    def _table_summary_lines(profile_results):
        # Natural-language overview of the whole table.
        if "table_summary" not in profile_results:
            return []
        return ["## Table Summary", profile_results["table_summary"], ""]

    @staticmethod
    def _duplicates_lines(profile_results):
        # Duplicate-row statistics plus the removal recommendation.
        if "duplicates" not in profile_results:
            return []
        dup = profile_results["duplicates"]
        return [
            "## Duplicate Analysis",
            f"- **Total rows**: {dup['total_rows']}",
            f"- **Duplicate rows**: {dup['count']} ({dup['percentage']:.2f}%)",
            f"- **Should remove**: {dup['should_remove']}",
            f"- **Analysis**: {dup['analysis']}",
            "",
        ]

    @staticmethod
    def _column_description_lines(profile_results):
        # One bullet per column; an inline rename suggestion is shown if it differs.
        if "column_descriptions" not in profile_results:
            return []
        lines = ["## Column Descriptions"]
        for col, info in profile_results["column_descriptions"].items():
            suggested = f" → *{info['suggested_name']}*" if info['suggested_name'] != col else ""
            lines.append(f"- **{col}**{suggested}: {info['description']}")
        lines.append("")
        return lines

    @staticmethod
    def _data_type_lines(profile_results):
        # Only columns whose suggested dtype differs from the current one are listed.
        if "data_types" not in profile_results:
            return []
        lines = ["## Data Type Analysis"]
        changes = [
            f"- **{col}**: {info['current_type']} → *{info['suggested_type']}* ({info['reason']})"
            for col, info in profile_results["data_types"].items()
            if info['suggested_type'] != info['current_type']
        ]
        lines.extend(changes if changes else ["- All data types are appropriate"])
        lines.append("")
        return lines

    @staticmethod
    def _missing_value_lines(profile_results):
        # Missing-value counts, split into problematic vs. likely-meaningful absences.
        if "missing_values" not in profile_results:
            return []
        lines = ["## Missing Values Analysis"]
        if "missing_analysis" in profile_results:
            lines.append(f"**Overview**: {profile_results['missing_analysis']}")
            lines.append("")

        problematic = []
        meaningful = []
        for col, info in profile_results["missing_values"].items():
            if info['count'] > 0:
                entry = f"**{col}**: {info['count']} missing ({info['percentage']:.1f}%) - {info['reason']}"
                (meaningful if info['is_meaningful'] else problematic).append(entry)

        if problematic:
            lines.append("### Problematic Missing Values")
            lines.extend(f"- {entry}" for entry in problematic)
            lines.append("")
        if meaningful:
            lines.append("### Likely Meaningful Missing Values")
            lines.extend(f"- {entry}" for entry in meaningful)
            lines.append("")
        return lines

    @staticmethod
    def _uniqueness_lines(profile_results):
        # Candidate keys (per the LLM) and other columns above 50% uniqueness.
        if "uniqueness" not in profile_results:
            return []
        lines = ["## Uniqueness Analysis"]
        candidate_keys = []
        highly_unique = []
        for col, info in profile_results["uniqueness"].items():
            if info['is_candidate_key']:
                candidate_keys.append(f"**{col}**: {info['explanation']}")
            elif info['unique_percentage'] > 50:
                highly_unique.append(f"**{col}**: {info['unique_percentage']:.1f}% unique")

        if candidate_keys:
            lines.append("### Candidate Key Columns")
            lines.extend(f"- {key}" for key in candidate_keys)
            lines.append("")
        if highly_unique:
            lines.append("### Highly Unique Columns")
            lines.extend(f"- {col}" for col in highly_unique)
            lines.append("")
        return lines

    @staticmethod
    def _unusual_value_lines(profile_results):
        # Per-column LLM findings; an explicit all-clear line when none were flagged.
        if "unusual_values" not in profile_results:
            return []
        lines = ["## Unusual Values Detection"]
        findings = [
            f"**{col}**: {info['explanation']}"
            for col, info in profile_results["unusual_values"].items()
            if info['has_unusual']
        ]
        if findings:
            lines.extend(f"- {finding}" for finding in findings)
        else:
            lines.append("- No unusual values detected")
        lines.append("")
        return lines

    def post(self, shared, prep_res, exec_res):
        # Persist the rendered markdown so the entry point can write it to disk.
        shared["final_report"] = exec_res
        print("Data profiling complete! Report generated.")
        return "default"
--------------------------------------------------------------------------------
/.clinerules:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: "Agentic Coding"
4 | ---
5 |
6 | # Agentic Coding: Humans Design, Agents code!
7 |
8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
9 | {: .warning }
10 |
11 | ## Agentic Coding Steps
12 |
13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation:
14 |
15 | | Steps | Human | AI | Comment |
16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------|
17 | | 1. Requirements | ★★★ High | ★☆☆ Low | Humans understand the requirements and context. |
18 | | 2. Flow | ★★☆ Medium | ★★☆ Medium | Humans specify the high-level design, and the AI fills in the details. |
19 | | 3. Utilities | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. |
20 | | 4. Data | ★☆☆ Low | ★★★ High | AI designs the data schema, and humans verify. |
21 | | 5. Node | ★☆☆ Low | ★★★ High | The AI helps design the node based on the flow. |
22 | | 6. Implementation | ★☆☆ Low | ★★★ High | The AI implements the flow based on the design. |
23 | | 7. Optimization | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. |
24 | | 8. Reliability | ★☆☆ Low | ★★★ High | The AI writes test cases and addresses corner cases. |
25 |
26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit.
27 | - Understand AI systems' strengths and limitations:
28 | - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails)
29 | - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL)
30 | - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning)
31 | - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features.
32 | - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early.
33 |
34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes.
35 | - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)).
36 | - For each node in the flow, start with a high-level one-line description of what it does.
37 | - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine).
38 | - If using **Agent**, specify what are the inputs (context) and what are the possible actions.
39 | - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows.
40 | - Outline the flow and draw it in a mermaid diagram. For example:
41 | ```mermaid
42 | flowchart LR
43 | start[Start] --> batch[Batch]
44 | batch --> check[Check]
45 | check -->|OK| process
46 | check -->|Error| fix[Fix]
47 | fix --> check
48 |
49 | subgraph process[Process]
50 | step1[Step 1] --> step2[Step 2]
51 | end
52 |
53 | process --> endNode[End]
54 | ```
55 | - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition.
56 | {: .best-practice }
57 |
58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions.
59 | - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world:
60 |

61 |
62 | - Reading inputs (e.g., retrieving Slack messages, reading emails)
63 | - Writing outputs (e.g., generating reports, sending emails)
64 | - Using external tools (e.g., calling LLMs, searching the web)
65 | - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system.
66 | - For each utility function, implement it and write a simple test.
67 | - Document their input/output, as well as why they are necessary. For example:
68 | - `name`: `get_embedding` (`utils/get_embedding.py`)
69 | - `input`: `str`
70 | - `output`: a vector of 3072 floats
71 | - `necessity`: Used by the second node to embed text
72 | - Example utility implementation:
73 | ```python
74 | # utils/call_llm.py
75 | from openai import OpenAI
76 |
77 | def call_llm(prompt):
78 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
79 | r = client.chat.completions.create(
80 | model="gpt-4o",
81 | messages=[{"role": "user", "content": prompt}]
82 | )
83 | return r.choices[0].message.content
84 |
85 | if __name__ == "__main__":
86 | prompt = "What is the meaning of life?"
87 | print(call_llm(prompt))
88 | ```
89 | - > **Sometimes, design Utilities before Flow:** For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them.
90 | {: .best-practice }
91 | - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures.
92 | {: .warning }
93 |
94 | 4. **Data Design**: Design the shared store that nodes will use to communicate.
95 | - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data.
96 | - For simple systems, use an in-memory dictionary.
97 | - For more complex systems or when persistence is required, use a database.
98 | - **Don't Repeat Yourself**: Use in-memory references or foreign keys.
99 | - Example shared store design:
100 | ```python
101 | shared = {
102 | "user": {
103 | "id": "user123",
104 | "context": { # Another nested dict
105 | "weather": {"temp": 72, "condition": "sunny"},
106 | "location": "San Francisco"
107 | }
108 | },
109 | "results": {} # Empty dict to store outputs
110 | }
111 | ```
112 |
113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions.
114 | - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example:
115 | - `type`: Regular (or Batch, or Async)
116 | - `prep`: Read "text" from the shared store
117 | - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures.
118 | - `post`: Write "embedding" to the shared store
119 |
120 | 6. **Implementation**: Implement the initial nodes and flows based on the design.
121 | - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins!
122 | - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking.
123 | - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system.
124 | - Add logging throughout the code to facilitate debugging.
125 |
126 | 7. **Optimization**:
127 | - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start.
128 | - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts.
129 | - If your flow design is already solid, move on to micro-optimizations:
130 | - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity.
131 | - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone.
132 |
133 | - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times.
134 | >
135 | > 
136 | {: .best-practice }
137 |
138 | 8. **Reliability**
139 | - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times.
140 | - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging.
141 | - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain.
142 |
143 | ## Example LLM Project File Structure
144 |
145 | ```
146 | my_project/
147 | ├── main.py
148 | ├── nodes.py
149 | ├── flow.py
150 | ├── utils/
151 | │ ├── __init__.py
152 | │ ├── call_llm.py
153 | │ └── search_web.py
154 | ├── requirements.txt
155 | └── docs/
156 | └── design.md
157 | ```
158 |
159 | - **`requirements.txt`**: Lists the Python dependencies for the project.
160 | ```
161 | PyYAML
162 | pocketflow
163 | ```
164 |
165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*.
166 | ~~~
167 | # Design Doc: Your Project Name
168 |
169 | > Please DON'T remove notes for AI
170 |
171 | ## Requirements
172 |
173 | > Notes for AI: Keep it simple and clear.
174 | > If the requirements are abstract, write concrete user stories
175 |
176 |
177 | ## Flow Design
178 |
179 | > Notes for AI:
180 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
181 | > 2. Present a concise, high-level description of the workflow.
182 |
183 | ### Applicable Design Pattern:
184 |
185 | 1. Map the file summary into chunks, then reduce these chunks into a final summary.
186 | 2. Agentic file finder
187 | - *Context*: The entire summary of the file
188 | - *Action*: Find the file
189 |
190 | ### Flow high-level Design:
191 |
192 | 1. **First Node**: This node is for ...
193 | 2. **Second Node**: This node is for ...
194 | 3. **Third Node**: This node is for ...
195 |
196 | ```mermaid
197 | flowchart TD
198 | firstNode[First Node] --> secondNode[Second Node]
199 | secondNode --> thirdNode[Third Node]
200 | ```
201 | ## Utility Functions
202 |
203 | > Notes for AI:
204 | > 1. Understand the utility function definition thoroughly by reviewing the doc.
205 | > 2. Include only the necessary utility functions, based on nodes in the flow.
206 |
207 | 1. **Call LLM** (`utils/call_llm.py`)
208 | - *Input*: prompt (str)
209 | - *Output*: response (str)
210 | - Generally used by most nodes for LLM tasks
211 |
212 | 2. **Embedding** (`utils/get_embedding.py`)
213 | - *Input*: str
214 | - *Output*: a vector of 3072 floats
215 | - Used by the second node to embed text
216 |
217 | ## Node Design
218 |
219 | ### Shared Store
220 |
221 | > Notes for AI: Try to minimize data redundancy
222 |
223 | The shared store structure is organized as follows:
224 |
225 | ```python
226 | shared = {
227 | "key": "value"
228 | }
229 | ```
230 |
231 | ### Node Steps
232 |
233 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
234 |
235 | 1. First Node
236 | - *Purpose*: Provide a short explanation of the node’s function
237 | - *Type*: Decide between Regular, Batch, or Async
238 | - *Steps*:
239 | - *prep*: Read "key" from the shared store
240 | - *exec*: Call the utility function
241 | - *post*: Write "key" to the shared store
242 |
243 | 2. Second Node
244 | ...
245 | ~~~
246 |
247 |
248 | - **`utils/`**: Contains all utility functions.
249 | - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`.
250 | - Each file should also include a `main()` function to try that API call
251 | ```python
252 | from google import genai
253 | import os
254 |
255 | def call_llm(prompt: str) -> str:
256 | client = genai.Client(
257 | api_key=os.getenv("GEMINI_API_KEY", ""),
258 | )
259 | model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
260 | response = client.models.generate_content(model=model, contents=[prompt])
261 | return response.text
262 |
263 | if __name__ == "__main__":
264 | test_prompt = "Hello, how are you?"
265 |
266 | # Simple smoke test of the API call
267 | print("Making call...")
268 | response1 = call_llm(test_prompt)
269 | print(f"Response: {response1}")
270 | ```
271 |
272 | - **`nodes.py`**: Contains all the node definitions.
273 | ```python
274 | # nodes.py
275 | from pocketflow import Node
276 | from utils.call_llm import call_llm
277 |
278 | class GetQuestionNode(Node):
279 | def exec(self, _):
280 | # Get question directly from user input
281 | user_question = input("Enter your question: ")
282 | return user_question
283 |
284 | def post(self, shared, prep_res, exec_res):
285 | # Store the user's question
286 | shared["question"] = exec_res
287 | return "default" # Go to the next node
288 |
289 | class AnswerNode(Node):
290 | def prep(self, shared):
291 | # Read question from shared
292 | return shared["question"]
293 |
294 | def exec(self, question):
295 | # Call LLM to get the answer
296 | return call_llm(question)
297 |
298 | def post(self, shared, prep_res, exec_res):
299 | # Store the answer in shared
300 | shared["answer"] = exec_res
301 | ```
302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them.
303 | ```python
304 | # flow.py
305 | from pocketflow import Flow
306 | from nodes import GetQuestionNode, AnswerNode
307 |
308 | def create_qa_flow():
309 | """Create and return a question-answering flow."""
310 | # Create nodes
311 | get_question_node = GetQuestionNode()
312 | answer_node = AnswerNode()
313 |
314 | # Connect nodes in sequence
315 | get_question_node >> answer_node
316 |
317 | # Create flow starting with input node
318 | return Flow(start=get_question_node)
319 | ```
320 | - **`main.py`**: Serves as the project's entry point.
321 | ```python
322 | # main.py
323 | from flow import create_qa_flow
324 |
325 | # Example main function
326 | # Please replace this with your own main function
327 | def main():
328 | shared = {
329 | "question": None, # Will be populated by GetQuestionNode from user input
330 | "answer": None # Will be populated by AnswerNode
331 | }
332 |
333 | # Create the flow and run it
334 | qa_flow = create_qa_flow()
335 | qa_flow.run(shared)
336 | print(f"Question: {shared['question']}")
337 | print(f"Answer: {shared['answer']}")
338 |
339 | if __name__ == "__main__":
340 | main()
341 | ```
342 |
343 | ================================================
344 | File: docs/index.md
345 | ================================================
346 | ---
347 | layout: default
348 | title: "Home"
349 | nav_order: 1
350 | ---
351 |
352 | # Pocket Flow
353 |
354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
355 |
356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, ZERO vendor lock-in.
357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more.
358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
359 |
360 |
361 |

362 |
363 |
364 | ## Core Abstraction
365 |
366 | We model the LLM workflow as a **Graph + Shared Store**:
367 |
368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks.
369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges).
370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows.
371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks.
372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks.
373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks.
374 |
375 |
376 |

377 |
378 |
379 | ## Design Pattern
380 |
381 | From there, it’s easy to implement popular design patterns:
382 |
383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions.
384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines.
385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation.
386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps.
387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently.
388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents.
389 |
390 |
391 |

392 |
393 |
394 | ## Utility Function
395 |
396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
397 |
398 | - [LLM Wrapper](./utility_function/llm.md)
399 | - [Viz and Debug](./utility_function/viz.md)
400 | - [Web Search](./utility_function/websearch.md)
401 | - [Chunking](./utility_function/chunking.md)
402 | - [Embedding](./utility_function/embedding.md)
403 | - [Vector Databases](./utility_function/vector.md)
404 | - [Text-to-Speech](./utility_function/text_to_speech.md)
405 |
406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework:
407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
410 |
411 | ## Ready to build your Apps?
412 |
413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow!
414 |
415 | ================================================
416 | File: docs/core_abstraction/async.md
417 | ================================================
418 | ---
419 | layout: default
420 | title: "(Advanced) Async"
421 | parent: "Core Abstraction"
422 | nav_order: 5
423 | ---
424 |
425 | # (Advanced) Async
426 |
427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for:
428 |
429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way.
430 | 2. **exec_async()**: Typically used for async LLM calls.
431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`.
432 |
433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes.
434 |
435 | ### Example
436 |
437 | ```python
438 | class SummarizeThenVerify(AsyncNode):
439 | async def prep_async(self, shared):
440 | # Example: read a file asynchronously
441 | doc_text = await read_file_async(shared["doc_path"])
442 | return doc_text
443 |
444 | async def exec_async(self, prep_res):
445 | # Example: async LLM call
446 | summary = await call_llm_async(f"Summarize: {prep_res}")
447 | return summary
448 |
449 | async def post_async(self, shared, prep_res, exec_res):
450 | # Example: wait for user feedback
451 | decision = await gather_user_feedback(exec_res)
452 | if decision == "approve":
453 | shared["summary"] = exec_res
454 | return "approve"
455 | return "deny"
456 |
457 | summarize_node = SummarizeThenVerify()
458 | final_node = Finalize()
459 |
460 | # Define transitions
461 | summarize_node - "approve" >> final_node
462 | summarize_node - "deny" >> summarize_node # retry
463 |
464 | flow = AsyncFlow(start=summarize_node)
465 |
466 | async def main():
467 | shared = {"doc_path": "document.txt"}
468 | await flow.run_async(shared)
469 | print("Final Summary:", shared.get("summary"))
470 |
471 | asyncio.run(main())
472 | ```
473 |
474 | ================================================
475 | File: docs/core_abstraction/batch.md
476 | ================================================
477 | ---
478 | layout: default
479 | title: "Batch"
480 | parent: "Core Abstraction"
481 | nav_order: 4
482 | ---
483 |
484 | # Batch
485 |
486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases:
487 | - **Chunk-based** processing (e.g., splitting large texts).
488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs).
489 |
490 | ## 1. BatchNode
491 |
492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`:
493 |
494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator).
495 | - **`exec(item)`**: called **once** per item in that iterable.
496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**.
497 |
498 |
499 | ### Example: Summarize a Large File
500 |
501 | ```python
502 | class MapSummaries(BatchNode):
503 | def prep(self, shared):
504 | # Suppose we have a big file; chunk it
505 | content = shared["data"]
506 | chunk_size = 10000
507 | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
508 | return chunks
509 |
510 | def exec(self, chunk):
511 | prompt = f"Summarize this chunk in 10 words: {chunk}"
512 | summary = call_llm(prompt)
513 | return summary
514 |
515 | def post(self, shared, prep_res, exec_res_list):
516 | combined = "\n".join(exec_res_list)
517 | shared["summary"] = combined
518 | return "default"
519 |
520 | map_summaries = MapSummaries()
521 | flow = Flow(start=map_summaries)
522 | flow.run(shared)
523 | ```
524 |
525 | ---
526 |
527 | ## 2. BatchFlow
528 |
529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set.
530 |
531 | ### Example: Summarize Many Files
532 |
533 | ```python
534 | class SummarizeAllFiles(BatchFlow):
535 | def prep(self, shared):
536 | # Return a list of param dicts (one per file)
537 | filenames = list(shared["data"].keys()) # e.g., ["file1.txt", "file2.txt", ...]
538 | return [{"filename": fn} for fn in filenames]
539 |
540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce):
541 | summarize_file = SummarizeFile(start=load_file)
542 |
543 | # Wrap that flow into a BatchFlow:
544 | summarize_all_files = SummarizeAllFiles(start=summarize_file)
545 | summarize_all_files.run(shared)
546 | ```
547 |
548 | ### Under the Hood
549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`.
550 | 2. The **BatchFlow** loops through each dict. For each one:
551 | - It merges the dict with the BatchFlow’s own `params`.
552 | - It calls `flow.run(shared)` using the merged result.
553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict.
554 |
555 | ---
556 |
557 | ## 3. Nested or Multi-Level Batches
558 |
559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance:
560 | - **Outer** batch: returns a list of directory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...).
561 | - **Inner** batch: returns a list of per-file param dicts.
562 |
563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once.
564 |
565 | ```python
566 |
567 | class FileBatchFlow(BatchFlow):
568 | def prep(self, shared):
569 | directory = self.params["directory"]
570 | # e.g., files = ["file1.txt", "file2.txt", ...]
571 | files = [f for f in os.listdir(directory) if f.endswith(".txt")]
572 | return [{"filename": f} for f in files]
573 |
574 | class DirectoryBatchFlow(BatchFlow):
575 | def prep(self, shared):
576 | directories = [ "/path/to/dirA", "/path/to/dirB"]
577 | return [{"directory": d} for d in directories]
578 |
579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"}
580 | inner_flow = FileBatchFlow(start=MapSummaries())
581 | outer_flow = DirectoryBatchFlow(start=inner_flow)
582 | ```
583 |
584 | ================================================
585 | File: docs/core_abstraction/communication.md
586 | ================================================
587 | ---
588 | layout: default
589 | title: "Communication"
590 | parent: "Core Abstraction"
591 | nav_order: 3
592 | ---
593 |
594 | # Communication
595 |
596 | Nodes and Flows **communicate** in 2 ways:
597 |
598 | 1. **Shared Store (for almost all the cases)**
599 |
600 |   - A global data structure (often an in-mem dict) that all nodes can read (`prep()`) and write (`post()`).
601 |   - Great for data results, large content, or anything multiple nodes need.
602 |   - You shall design the data structure and populate it ahead of time.
603 |
604 |   - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more of a syntax sugar for [Batch](./batch.md).
605 | {: .best-practice }
606 |
607 | 2. **Params (only for [Batch](./batch.md))**
608 | - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**.
609 | - Good for identifiers like filenames or numeric IDs, in Batch mode.
610 |
611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller).
612 |
613 | ---
614 |
615 | ## 1. Shared Store
616 |
617 | ### Overview
618 |
619 | A shared store is typically an in-mem dictionary, like:
620 | ```python
621 | shared = {"data": {}, "summary": {}, "config": {...}, ...}
622 | ```
623 |
624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements.
625 |
626 | ### Example
627 |
628 | ```python
629 | class LoadData(Node):
630 | def post(self, shared, prep_res, exec_res):
631 | # We write data to shared store
632 | shared["data"] = "Some text content"
633 | return None
634 |
635 | class Summarize(Node):
636 | def prep(self, shared):
637 | # We read data from shared store
638 | return shared["data"]
639 |
640 | def exec(self, prep_res):
641 | # Call LLM to summarize
642 | prompt = f"Summarize: {prep_res}"
643 | summary = call_llm(prompt)
644 | return summary
645 |
646 | def post(self, shared, prep_res, exec_res):
647 | # We write summary to shared store
648 | shared["summary"] = exec_res
649 | return "default"
650 |
651 | load_data = LoadData()
652 | summarize = Summarize()
653 | load_data >> summarize
654 | flow = Flow(start=load_data)
655 |
656 | shared = {}
657 | flow.run(shared)
658 | ```
659 |
660 | Here:
661 | - `LoadData` writes to `shared["data"]`.
662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`.
663 |
664 | ---
665 |
666 | ## 2. Params
667 |
668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are:
669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`).
670 | - **Set** via `set_params()`.
671 | - **Cleared** and updated each time a parent Flow calls it.
672 |
673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow.
674 | >
675 | > If you need to set child node params, see [Batch](./batch.md).
676 | {: .warning }
677 |
678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store.
679 |
680 | ### Example
681 |
682 | ```python
683 | # 1) Create a Node that uses params
684 | class SummarizeFile(Node):
685 | def prep(self, shared):
686 | # Access the node's param
687 | filename = self.params["filename"]
688 | return shared["data"].get(filename, "")
689 |
690 | def exec(self, prep_res):
691 | prompt = f"Summarize: {prep_res}"
692 | return call_llm(prompt)
693 |
694 | def post(self, shared, prep_res, exec_res):
695 | filename = self.params["filename"]
696 | shared["summary"][filename] = exec_res
697 | return "default"
698 |
699 | # 2) Set params
700 | node = SummarizeFile()
701 |
702 | # 3) Set Node params directly (for testing)
703 | node.set_params({"filename": "doc1.txt"})
704 | node.run(shared)
705 |
706 | # 4) Create Flow
707 | flow = Flow(start=node)
708 |
709 | # 5) Set Flow params (overwrites node params)
710 | flow.set_params({"filename": "doc2.txt"})
711 | flow.run(shared) # The node summarizes doc2, not doc1
712 | ```
713 |
714 | ================================================
715 | File: docs/core_abstraction/flow.md
716 | ================================================
717 | ---
718 | layout: default
719 | title: "Flow"
720 | parent: "Core Abstraction"
721 | nav_order: 2
722 | ---
723 |
724 | # Flow
725 |
726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`.
727 |
728 | ## 1. Action-based Transitions
729 |
730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`.
731 |
732 | You define transitions with the syntax:
733 |
734 | 1. **Basic default transition**: `node_a >> node_b`
735 | This means if `node_a.post()` returns `"default"`, go to `node_b`.
736 | (Equivalent to `node_a - "default" >> node_b`)
737 |
738 | 2. **Named action transition**: `node_a - "action_name" >> node_b`
739 | This means if `node_a.post()` returns `"action_name"`, go to `node_b`.
740 |
741 | It's possible to create loops, branching, or multi-step flows.
742 |
743 | ## 2. Creating a Flow
744 |
745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node.
746 |
747 | ### Example: Simple Sequence
748 |
749 | Here's a minimal flow of two nodes in a chain:
750 |
751 | ```python
752 | node_a >> node_b
753 | flow = Flow(start=node_a)
754 | flow.run(shared)
755 | ```
756 |
757 | - When you run the flow, it executes `node_a`.
758 | - Suppose `node_a.post()` returns `"default"`.
759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`.
760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there.
761 |
762 | ### Example: Branching & Looping
763 |
764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions:
765 |
766 | - `"approved"`: expense is approved, move to payment processing
767 | - `"needs_revision"`: expense needs changes, send back for revision
768 | - `"rejected"`: expense is denied, finish the process
769 |
770 | We can wire them like this:
771 |
772 | ```python
773 | # Define the flow connections
774 | review - "approved" >> payment # If approved, process payment
775 | review - "needs_revision" >> revise # If needs changes, go to revision
776 | review - "rejected" >> finish # If rejected, finish the process
777 |
778 | revise >> review # After revision, go back for another review
779 | payment >> finish # After payment, finish the process
780 |
781 | flow = Flow(start=review)
782 | ```
783 |
784 | Let's see how it flows:
785 |
786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node
787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review`
788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops
789 |
790 | ```mermaid
791 | flowchart TD
792 | review[Review Expense] -->|approved| payment[Process Payment]
793 | review -->|needs_revision| revise[Revise Report]
794 | review -->|rejected| finish[Finish Process]
795 |
796 | revise --> review
797 | payment --> finish
798 | ```
799 |
800 | ### Running Individual Nodes vs. Running a Flow
801 |
802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action.
803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue.
804 |
805 | > `node.run(shared)` **does not** proceed to the successor.
806 | > This is mainly for debugging or testing a single node.
807 | >
808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly.
809 | {: .warning }
810 |
811 | ## 3. Nested Flows
812 |
813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can:
814 |
815 | 1. Use a Flow as a Node within another Flow's transitions.
816 | 2. Combine multiple smaller Flows into a larger Flow for reuse.
817 | 3. Node `params` will be a merging of **all** parents' `params`.
818 |
819 | ### Flow's Node Methods
820 |
821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However:
822 |
823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes.
824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store.
825 |
826 | ### Basic Flow Nesting
827 |
828 | Here's how to connect a flow to another node:
829 |
830 | ```python
831 | # Create a sub-flow
832 | node_a >> node_b
833 | subflow = Flow(start=node_a)
834 |
835 | # Connect it to another node
836 | subflow >> node_c
837 |
838 | # Create the parent flow
839 | parent_flow = Flow(start=subflow)
840 | ```
841 |
842 | When `parent_flow.run()` executes:
843 | 1. It starts `subflow`
844 | 2. `subflow` runs through its nodes (`node_a->node_b`)
845 | 3. After `subflow` completes, execution continues to `node_c`
846 |
847 | ### Example: Order Processing Pipeline
848 |
849 | Here's a practical example that breaks down order processing into nested flows:
850 |
851 | ```python
852 | # Payment processing sub-flow
853 | validate_payment >> process_payment >> payment_confirmation
854 | payment_flow = Flow(start=validate_payment)
855 |
856 | # Inventory sub-flow
857 | check_stock >> reserve_items >> update_inventory
858 | inventory_flow = Flow(start=check_stock)
859 |
860 | # Shipping sub-flow
861 | create_label >> assign_carrier >> schedule_pickup
862 | shipping_flow = Flow(start=create_label)
863 |
864 | # Connect the flows into a main order pipeline
865 | payment_flow >> inventory_flow >> shipping_flow
866 |
867 | # Create the master flow
868 | order_pipeline = Flow(start=payment_flow)
869 |
870 | # Run the entire pipeline
871 | order_pipeline.run(shared_data)
872 | ```
873 |
874 | This creates a clean separation of concerns while maintaining a clear execution path:
875 |
876 | ```mermaid
877 | flowchart LR
878 | subgraph order_pipeline[Order Pipeline]
879 | subgraph paymentFlow["Payment Flow"]
880 | A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation]
881 | end
882 |
883 | subgraph inventoryFlow["Inventory Flow"]
884 | D[Check Stock] --> E[Reserve Items] --> F[Update Inventory]
885 | end
886 |
887 | subgraph shippingFlow["Shipping Flow"]
888 | G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup]
889 | end
890 |
891 | paymentFlow --> inventoryFlow
892 | inventoryFlow --> shippingFlow
893 | end
894 | ```
895 |
896 | ================================================
897 | File: docs/core_abstraction/node.md
898 | ================================================
899 | ---
900 | layout: default
901 | title: "Node"
902 | parent: "Core Abstraction"
903 | nav_order: 1
904 | ---
905 |
906 | # Node
907 |
908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`:
909 |
910 |
911 |

912 |
913 |
914 | 1. `prep(shared)`
915 | - **Read and preprocess data** from `shared` store.
916 | - Examples: *query DB, read files, or serialize data into a string*.
917 | - Return `prep_res`, which is used by `exec()` and `post()`.
918 |
919 | 2. `exec(prep_res)`
920 | - **Execute compute logic**, with optional retries and error handling (below).
921 | - Examples: *(mostly) LLM calls, remote APIs, tool use*.
922 | - ⚠️ This shall be only for compute and **NOT** access `shared`.
923 | - ⚠️ If retries enabled, ensure idempotent implementation.
924 | - ⚠️ Defer exception handling to the Node's built-in retry mechanism.
925 | - Return `exec_res`, which is passed to `post()`.
926 |
927 | 3. `post(shared, prep_res, exec_res)`
928 | - **Postprocess and write data** back to `shared`.
929 | - Examples: *update DB, change states, log results*.
930 | - **Decide the next action** by returning a *string* (`action = "default"` if *None*).
931 |
932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately.
933 | >
934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data.
935 | {: .note }
936 |
937 | ### Fault Tolerance & Retries
938 |
939 | You can **retry** `exec()` if it raises an exception via two parameters when defining the Node:
940 |
941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry).
942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting).
943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off.
944 |
945 | ```python
946 | my_node = SummarizeFile(max_retries=3, wait=10)
947 | ```
948 |
949 | When an exception occurs in `exec()`, the Node automatically retries until:
950 |
951 | - It either succeeds, or
952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt.
953 |
954 | You can get the current retry times (0-based) from `self.cur_retry`.
955 |
956 | ```python
957 | class RetryNode(Node):
958 | def exec(self, prep_res):
959 | print(f"Retry {self.cur_retry} times")
960 | raise Exception("Failed")
961 | ```
962 |
963 | ### Graceful Fallback
964 |
965 | To **gracefully handle** the exception (after all retries) rather than raising it, override:
966 |
967 | ```python
968 | def exec_fallback(self, prep_res, exc):
969 | raise exc
970 | ```
971 |
972 | By default, it just re-raises exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`.
973 |
974 | ### Example: Summarize file
975 |
976 | ```python
977 | class SummarizeFile(Node):
978 | def prep(self, shared):
979 | return shared["data"]
980 |
981 | def exec(self, prep_res):
982 | if not prep_res:
983 | return "Empty file content"
984 | prompt = f"Summarize this text in 10 words: {prep_res}"
985 | summary = call_llm(prompt) # might fail
986 | return summary
987 |
988 | def exec_fallback(self, prep_res, exc):
989 | # Provide a simple fallback instead of crashing
990 | return "There was an error processing your request."
991 |
992 | def post(self, shared, prep_res, exec_res):
993 | shared["summary"] = exec_res
994 | # Return "default" by not returning
995 |
996 | summarize_node = SummarizeFile(max_retries=3)
997 |
998 | # node.run() calls prep->exec->post
999 | # If exec() fails, it retries up to 3 times before calling exec_fallback()
1000 | action_result = summarize_node.run(shared)
1001 |
1002 | print("Action returned:", action_result) # "default"
1003 | print("Summary stored:", shared["summary"])
1004 | ```
1005 |
1006 | ================================================
1007 | File: docs/core_abstraction/parallel.md
1008 | ================================================
1009 | ---
1010 | layout: default
1011 | title: "(Advanced) Parallel"
1012 | parent: "Core Abstraction"
1013 | nav_order: 6
1014 | ---
1015 |
1016 | # (Advanced) Parallel
1017 |
1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute.
1019 |
1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O.
1021 | {: .warning }
1022 |
1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize.
1024 | >
1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals).
1026 | >
1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits.
1028 | {: .best-practice }
1029 |
1030 | ## AsyncParallelBatchNode
1031 |
1032 | Like **AsyncBatchNode**, but runs `exec_async()` in **parallel**:
1033 |
1034 | ```python
1035 | class ParallelSummaries(AsyncParallelBatchNode):
1036 | async def prep_async(self, shared):
1037 | # e.g., multiple texts
1038 | return shared["texts"]
1039 |
1040 | async def exec_async(self, text):
1041 | prompt = f"Summarize: {text}"
1042 | return await call_llm_async(prompt)
1043 |
1044 | async def post_async(self, shared, prep_res, exec_res_list):
1045 | shared["summary"] = "\n\n".join(exec_res_list)
1046 | return "default"
1047 |
1048 | node = ParallelSummaries()
1049 | flow = AsyncFlow(start=node)
1050 | ```
1051 |
1052 | ## AsyncParallelBatchFlow
1053 |
1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters:
1055 |
1056 | ```python
1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow):
1058 | async def prep_async(self, shared):
1059 | return [{"filename": f} for f in shared["files"]]
1060 |
1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile())
1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow)
1063 | await parallel_flow.run_async(shared)
1064 | ```
1065 |
1066 | ================================================
1067 | File: docs/design_pattern/agent.md
1068 | ================================================
1069 | ---
1070 | layout: default
1071 | title: "Agent"
1072 | parent: "Design Pattern"
1073 | nav_order: 1
1074 | ---
1075 |
1076 | # Agent
1077 |
1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context.
1079 |
1080 |
1081 |

1082 |
1083 |
1084 | ## Implement Agent with Graph
1085 |
1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions.
1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
1088 | 3. **Agent Node:** Provide a prompt to decide action—for example:
1089 |
1090 | ```python
1091 | f"""
1092 | ### CONTEXT
1093 | Task: {task_description}
1094 | Previous Actions: {previous_actions}
1095 | Current State: {current_state}
1096 |
1097 | ### ACTION SPACE
1098 | [1] search
1099 | Description: Use web search to get results
1100 | Parameters:
1101 | - query (str): What to search for
1102 |
1103 | [2] answer
1104 | Description: Conclude based on the results
1105 | Parameters:
1106 | - result (str): Final answer to provide
1107 |
1108 | ### NEXT ACTION
1109 | Decide the next action based on the current context and available action space.
1110 | Return your response in the following format:
1111 |
1112 | ```yaml
1113 | thinking: |
1114 |
1115 | action:
1116 | parameters:
1117 | :
1118 | ```"""
1119 | ```
1120 |
1121 | The core of building **high-performance** and **reliable** agents boils down to:
1122 |
1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content.
1124 |
1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or `read_csvs`. Instead, import CSVs into the database.
1126 |
1127 | ## Example Good Action Design
1128 |
1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once.
1130 |
1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts).
1132 |
1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files.
1134 |
1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends.
1136 |
1137 | ## Example: Search Agent
1138 |
1139 | This agent:
1140 | 1. Decides whether to search or answer
1141 | 2. If it searches, loops back to decide whether more searching is needed
1142 | 3. Answers when enough context has been gathered
1143 |
1144 | ```python
1145 | class DecideAction(Node):
1146 | def prep(self, shared):
1147 | context = shared.get("context", "No previous search")
1148 | query = shared["query"]
1149 | return query, context
1150 |
1151 | def exec(self, inputs):
1152 | query, context = inputs
1153 | prompt = f"""
1154 | Given input: {query}
1155 | Previous search results: {context}
1156 | Should I: 1) Search web for more info 2) Answer with current knowledge
1157 | Output in yaml:
1158 | ```yaml
1159 | action: search/answer
1160 | reason: why this action
1161 | search_term: search phrase if action is search
1162 | ```"""
1163 | resp = call_llm(prompt)
1164 | yaml_str = resp.split("```yaml")[1].split("```")[0].strip()
1165 | result = yaml.safe_load(yaml_str)
1166 |
1167 | assert isinstance(result, dict)
1168 | assert "action" in result
1169 | assert "reason" in result
1170 | assert result["action"] in ["search", "answer"]
1171 | if result["action"] == "search":
1172 | assert "search_term" in result
1173 |
1174 | return result
1175 |
1176 | def post(self, shared, prep_res, exec_res):
1177 | if exec_res["action"] == "search":
1178 | shared["search_term"] = exec_res["search_term"]
1179 | return exec_res["action"]
1180 |
1181 | class SearchWeb(Node):
1182 | def prep(self, shared):
1183 | return shared["search_term"]
1184 |
1185 | def exec(self, search_term):
1186 | return search_web(search_term)
1187 |
1188 | def post(self, shared, prep_res, exec_res):
1189 | prev_searches = shared.get("context", [])
1190 | shared["context"] = prev_searches + [
1191 | {"term": shared["search_term"], "result": exec_res}
1192 | ]
1193 | return "decide"
1194 |
1195 | class DirectAnswer(Node):
1196 | def prep(self, shared):
1197 | return shared["query"], shared.get("context", "")
1198 |
1199 | def exec(self, inputs):
1200 | query, context = inputs
1201 | return call_llm(f"Context: {context}\nAnswer: {query}")
1202 |
1203 | def post(self, shared, prep_res, exec_res):
1204 | print(f"Answer: {exec_res}")
1205 | shared["answer"] = exec_res
1206 |
1207 | # Connect nodes
1208 | decide = DecideAction()
1209 | search = SearchWeb()
1210 | answer = DirectAnswer()
1211 |
1212 | decide - "search" >> search
1213 | decide - "answer" >> answer
1214 | search - "decide" >> decide # Loop back
1215 |
1216 | flow = Flow(start=decide)
1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"})
1218 | ```
1219 |
1220 | ================================================
1221 | File: docs/design_pattern/mapreduce.md
1222 | ================================================
1223 | ---
1224 | layout: default
1225 | title: "Map Reduce"
1226 | parent: "Design Pattern"
1227 | nav_order: 4
1228 | ---
1229 |
1230 | # Map Reduce
1231 |
1232 | MapReduce is a design pattern suitable when you have either:
1233 | - Large input data (e.g., multiple files to process), or
1234 | - Large output data (e.g., multiple forms to fill)
1235 |
1236 | and there is a logical way to break the task into smaller, ideally independent parts.
1237 |
1238 |
1239 |

1240 |
1241 |
1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
1243 |
1244 | ### Example: Document Summarization
1245 |
1246 | ```python
1247 | class SummarizeAllFiles(BatchNode):
1248 | def prep(self, shared):
1249 | files_dict = shared["files"] # e.g. 10 files
1250 | return list(files_dict.items()) # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...]
1251 |
1252 | def exec(self, one_file):
1253 | filename, file_content = one_file
1254 | summary_text = call_llm(f"Summarize the following file:\n{file_content}")
1255 | return (filename, summary_text)
1256 |
1257 | def post(self, shared, prep_res, exec_res_list):
1258 | shared["file_summaries"] = dict(exec_res_list)
1259 |
1260 | class CombineSummaries(Node):
1261 | def prep(self, shared):
1262 | return shared["file_summaries"]
1263 |
1264 | def exec(self, file_summaries):
1265 | # format as: "File1: summary\nFile2: summary...\n"
1266 | text_list = []
1267 | for fname, summ in file_summaries.items():
1268 | text_list.append(f"{fname} summary:\n{summ}\n")
1269 | big_text = "\n---\n".join(text_list)
1270 |
1271 | return call_llm(f"Combine these file summaries into one final summary:\n{big_text}")
1272 |
1273 | def post(self, shared, prep_res, final_summary):
1274 | shared["all_files_summary"] = final_summary
1275 |
1276 | batch_node = SummarizeAllFiles()
1277 | combine_node = CombineSummaries()
1278 | batch_node >> combine_node
1279 |
1280 | flow = Flow(start=batch_node)
1281 |
1282 | shared = {
1283 | "files": {
1284 | "file1.txt": "Alice was beginning to get very tired of sitting by her sister...",
1285 | "file2.txt": "Some other interesting text ...",
1286 | # ...
1287 | }
1288 | }
1289 | flow.run(shared)
1290 | print("Individual Summaries:", shared["file_summaries"])
1291 | print("\nFinal Summary:\n", shared["all_files_summary"])
1292 | ```
1293 |
1294 | ================================================
1295 | File: docs/design_pattern/rag.md
1296 | ================================================
1297 | ---
1298 | layout: default
1299 | title: "RAG"
1300 | parent: "Design Pattern"
1301 | nav_order: 3
1302 | ---
1303 |
1304 | # RAG (Retrieval Augmented Generation)
1305 |
1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline:
1307 |
1308 |
1309 |

1310 |
1311 |
1312 | 1. **Offline stage**: Preprocess and index documents ("building the index").
1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context.
1314 |
1315 | ---
1316 | ## Stage 1: Offline Indexing
1317 |
1318 | We create three Nodes:
1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text.
1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk.
1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md).
1322 |
1323 | ```python
1324 | class ChunkDocs(BatchNode):
1325 | def prep(self, shared):
1326 | # A list of file paths in shared["files"]. We process each file.
1327 | return shared["files"]
1328 |
1329 | def exec(self, filepath):
1330 | # read file content. In real usage, do error handling.
1331 | with open(filepath, "r", encoding="utf-8") as f:
1332 | text = f.read()
1333 | # chunk by 100 chars each
1334 | chunks = []
1335 | size = 100
1336 | for i in range(0, len(text), size):
1337 | chunks.append(text[i : i + size])
1338 | return chunks
1339 |
1340 | def post(self, shared, prep_res, exec_res_list):
1341 | # exec_res_list is a list of chunk-lists, one per file.
1342 | # flatten them all into a single list of chunks.
1343 | all_chunks = []
1344 | for chunk_list in exec_res_list:
1345 | all_chunks.extend(chunk_list)
1346 | shared["all_chunks"] = all_chunks
1347 |
1348 | class EmbedDocs(BatchNode):
1349 | def prep(self, shared):
1350 | return shared["all_chunks"]
1351 |
1352 | def exec(self, chunk):
1353 | return get_embedding(chunk)
1354 |
1355 | def post(self, shared, prep_res, exec_res_list):
1356 | # Store the list of embeddings.
1357 | shared["all_embeds"] = exec_res_list
1358 | print(f"Total embeddings: {len(exec_res_list)}")
1359 |
1360 | class StoreIndex(Node):
1361 | def prep(self, shared):
1362 | # We'll read all embeds from shared.
1363 | return shared["all_embeds"]
1364 |
1365 | def exec(self, all_embeds):
1366 | # Create a vector index (faiss or other DB in real usage).
1367 | index = create_index(all_embeds)
1368 | return index
1369 |
1370 | def post(self, shared, prep_res, index):
1371 | shared["index"] = index
1372 |
1373 | # Wire them in sequence
1374 | chunk_node = ChunkDocs()
1375 | embed_node = EmbedDocs()
1376 | store_node = StoreIndex()
1377 |
1378 | chunk_node >> embed_node >> store_node
1379 |
1380 | OfflineFlow = Flow(start=chunk_node)
1381 | ```
1382 |
1383 | Usage example:
1384 |
1385 | ```python
1386 | shared = {
1387 | "files": ["doc1.txt", "doc2.txt"], # any text files
1388 | }
1389 | OfflineFlow.run(shared)
1390 | ```
1391 |
1392 | ---
1393 | ## Stage 2: Online Query & Answer
1394 |
1395 | We have 3 nodes:
1396 | 1. `EmbedQuery` – embeds the user’s question.
1397 | 2. `RetrieveDocs` – retrieves top chunk from the index.
1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer.
1399 |
1400 | ```python
1401 | class EmbedQuery(Node):
1402 | def prep(self, shared):
1403 | return shared["question"]
1404 |
1405 | def exec(self, question):
1406 | return get_embedding(question)
1407 |
1408 | def post(self, shared, prep_res, q_emb):
1409 | shared["q_emb"] = q_emb
1410 |
1411 | class RetrieveDocs(Node):
1412 | def prep(self, shared):
1413 | # We'll need the query embedding, plus the offline index/chunks
1414 | return shared["q_emb"], shared["index"], shared["all_chunks"]
1415 |
1416 | def exec(self, inputs):
1417 | q_emb, index, chunks = inputs
1418 | I, D = search_index(index, q_emb, top_k=1)
1419 | best_id = I[0][0]
1420 | relevant_chunk = chunks[best_id]
1421 | return relevant_chunk
1422 |
1423 | def post(self, shared, prep_res, relevant_chunk):
1424 | shared["retrieved_chunk"] = relevant_chunk
1425 | print("Retrieved chunk:", relevant_chunk[:60], "...")
1426 |
1427 | class GenerateAnswer(Node):
1428 | def prep(self, shared):
1429 | return shared["question"], shared["retrieved_chunk"]
1430 |
1431 | def exec(self, inputs):
1432 | question, chunk = inputs
1433 | prompt = f"Question: {question}\nContext: {chunk}\nAnswer:"
1434 | return call_llm(prompt)
1435 |
1436 | def post(self, shared, prep_res, answer):
1437 | shared["answer"] = answer
1438 | print("Answer:", answer)
1439 |
1440 | embed_qnode = EmbedQuery()
1441 | retrieve_node = RetrieveDocs()
1442 | generate_node = GenerateAnswer()
1443 |
1444 | embed_qnode >> retrieve_node >> generate_node
1445 | OnlineFlow = Flow(start=embed_qnode)
1446 | ```
1447 |
1448 | Usage example:
1449 |
1450 | ```python
1451 | # Suppose we already ran OfflineFlow and have:
1452 | # shared["all_chunks"], shared["index"], etc.
1453 | shared["question"] = "Why do people like cats?"
1454 |
1455 | OnlineFlow.run(shared)
1456 | # final answer in shared["answer"]
1457 | ```
1458 |
1459 | ================================================
1460 | File: docs/design_pattern/structure.md
1461 | ================================================
1462 | ---
1463 | layout: default
1464 | title: "Structured Output"
1465 | parent: "Design Pattern"
1466 | nav_order: 5
1467 | ---
1468 |
1469 | # Structured Output
1470 |
1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys.
1472 |
1473 | There are several approaches to achieve a structured output:
1474 | - **Prompting** the LLM to strictly return a defined structure.
1475 | - Using LLMs that natively support **schema enforcement**.
1476 | - **Post-processing** the LLM's response to extract structured content.
1477 |
1478 | In practice, **Prompting** is simple and reliable for modern LLMs.
1479 |
1480 | ### Example Use Cases
1481 |
1482 | - Extracting Key Information
1483 |
1484 | ```yaml
1485 | product:
1486 | name: Widget Pro
1487 | price: 199.99
1488 | description: |
1489 | A high-quality widget designed for professionals.
1490 | Recommended for advanced users.
1491 | ```
1492 |
1493 | - Summarizing Documents into Bullet Points
1494 |
1495 | ```yaml
1496 | summary:
1497 | - This product is easy to use.
1498 | - It is cost-effective.
1499 | - Suitable for all skill levels.
1500 | ```
1501 |
1502 | - Generating Configuration Files
1503 |
1504 | ```yaml
1505 | server:
1506 | host: 127.0.0.1
1507 | port: 8080
1508 | ssl: true
1509 | ```
1510 |
1511 | ## Prompt Engineering
1512 |
1513 | When prompting the LLM to produce **structured** output:
1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`).
1515 | 2. **Validate** that all required fields exist (and let `Node` handle retries).
1516 |
1517 | ### Example Text Summarization
1518 |
1519 | ```python
1520 | class SummarizeNode(Node):
1521 | def exec(self, prep_res):
1522 | # Suppose `prep_res` is the text to summarize.
1523 | prompt = f"""
1524 | Please summarize the following text as YAML, with exactly 3 bullet points
1525 |
1526 | {prep_res}
1527 |
1528 | Now, output:
1529 | ```yaml
1530 | summary:
1531 | - bullet 1
1532 | - bullet 2
1533 | - bullet 3
1534 | ```"""
1535 | response = call_llm(prompt)
1536 | yaml_str = response.split("```yaml")[1].split("```")[0].strip()
1537 |
1538 | import yaml
1539 | structured_result = yaml.safe_load(yaml_str)
1540 |
1541 | assert "summary" in structured_result
1542 | assert isinstance(structured_result["summary"], list)
1543 |
1544 | return structured_result
1545 | ```
1546 |
1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic)
1548 | {: .note }
1549 |
1550 | ### Why YAML instead of JSON?
1551 |
1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes.
1553 |
1554 | **In JSON**
1555 |
1556 | ```json
1557 | {
1558 | "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\""
1559 | }
1560 | ```
1561 |
1562 | - Every double quote inside the string must be escaped with `\"`.
1563 | - Each newline in the dialogue must be represented as `\n`.
1564 |
1565 | **In YAML**
1566 |
1567 | ```yaml
1568 | dialogue: |
1569 | Alice said: "Hello Bob.
1570 | How are you?
1571 | I am good."
1572 | ```
1573 |
1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`).
1575 | - Newlines are naturally preserved without needing `\n`.
1576 |
1577 | ================================================
1578 | File: docs/design_pattern/workflow.md
1579 | ================================================
1580 | ---
1581 | layout: default
1582 | title: "Workflow"
1583 | parent: "Design Pattern"
1584 | nav_order: 2
1585 | ---
1586 |
1587 | # Workflow
1588 |
1589 | Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes.
1590 |
1591 |
1592 |

1593 |
1594 |
1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*.
1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*.
1597 | >
1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md).
1599 | {: .best-practice }
1600 |
1601 | ### Example: Article Writing
1602 |
1603 | ```python
1604 | class GenerateOutline(Node):
1605 | def prep(self, shared): return shared["topic"]
1606 | def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}")
1607 | def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res
1608 |
1609 | class WriteSection(Node):
1610 | def prep(self, shared): return shared["outline"]
1611 | def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}")
1612 | def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res
1613 |
1614 | class ReviewAndRefine(Node):
1615 | def prep(self, shared): return shared["draft"]
1616 | def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}")
1617 | def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res
1618 |
1619 | # Connect nodes
1620 | outline = GenerateOutline()
1621 | write = WriteSection()
1622 | review = ReviewAndRefine()
1623 |
1624 | outline >> write >> review
1625 |
1626 | # Create and run flow
1627 | writing_flow = Flow(start=outline)
1628 | shared = {"topic": "AI Safety"}
1629 | writing_flow.run(shared)
1630 | ```
1631 |
1632 | For *dynamic cases*, consider using [Agents](./agent.md).
1633 |
1634 | ================================================
1635 | File: docs/utility_function/llm.md
1636 | ================================================
1637 | ---
1638 | layout: default
1639 | title: "LLM Wrapper"
1640 | parent: "Utility Function"
1641 | nav_order: 1
1642 | ---
1643 |
1644 | # LLM Wrappers
1645 |
1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm).
1647 | Here, we provide some minimal example implementations:
1648 |
1649 | 1. OpenAI
1650 | ```python
1651 | def call_llm(prompt):
1652 | from openai import OpenAI
1653 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
1654 | r = client.chat.completions.create(
1655 | model="gpt-4o",
1656 | messages=[{"role": "user", "content": prompt}]
1657 | )
1658 | return r.choices[0].message.content
1659 |
1660 | # Example usage
1661 | call_llm("How are you?")
1662 | ```
1663 | > Store the API key in an environment variable like OPENAI_API_KEY for security.
1664 | {: .best-practice }
1665 |
1666 | 2. Claude (Anthropic)
1667 | ```python
1668 | def call_llm(prompt):
1669 | from anthropic import Anthropic
1670 | client = Anthropic(api_key="YOUR_API_KEY_HERE")
1671 | r = client.messages.create(
1672 | model="claude-sonnet-4-0",
1673 | messages=[
1674 | {"role": "user", "content": prompt}
1675 | ]
1676 | )
1677 | return r.content[0].text
1678 | ```
1679 |
1680 | 3. Google (Generative AI Studio / PaLM API)
1681 | ```python
1682 | def call_llm(prompt):
1683 | from google import genai
1684 | client = genai.Client(api_key='GEMINI_API_KEY')
1685 | response = client.models.generate_content(
1686 | model='gemini-2.5-pro',
1687 | contents=prompt
1688 | )
1689 | return response.text
1690 | ```
1691 |
1692 | 4. Azure (Azure OpenAI)
1693 | ```python
1694 | def call_llm(prompt):
1695 | from openai import AzureOpenAI
1696 | client = AzureOpenAI(
1697 |         azure_endpoint="https://<your-resource-name>.openai.azure.com/",
1698 | api_key="YOUR_API_KEY_HERE",
1699 | api_version="2023-05-15"
1700 | )
1701 | r = client.chat.completions.create(
1702 |         model="<your-deployment-name>",
1703 | messages=[{"role": "user", "content": prompt}]
1704 | )
1705 | return r.choices[0].message.content
1706 | ```
1707 |
1708 | 5. Ollama (Local LLM)
1709 | ```python
1710 | def call_llm(prompt):
1711 | from ollama import chat
1712 | response = chat(
1713 | model="llama2",
1714 | messages=[{"role": "user", "content": prompt}]
1715 | )
1716 | return response.message.content
1717 | ```
1718 |
1719 | ## Improvements
1720 | Feel free to enhance your `call_llm` function as needed. Here are examples:
1721 |
1722 | - Handle chat history:
1723 |
1724 | ```python
1725 | def call_llm(messages):
1726 | from openai import OpenAI
1727 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
1728 | r = client.chat.completions.create(
1729 | model="gpt-4o",
1730 | messages=messages
1731 | )
1732 | return r.choices[0].message.content
1733 | ```
1734 |
1735 | - Add in-memory caching
1736 |
1737 | ```python
1738 | from functools import lru_cache
1739 |
1740 | @lru_cache(maxsize=1000)
1741 | def call_llm(prompt):
1742 | # Your implementation here
1743 | pass
1744 | ```
1745 |
1746 | > ⚠️ Caching conflicts with Node retries, as retries yield the same result.
1747 | >
1748 | > To address this, you could use cached results only if not retried.
1749 | {: .warning }
1750 |
1751 |
1752 | ```python
1753 | from functools import lru_cache
1754 |
1755 | @lru_cache(maxsize=1000)
1756 | def cached_call(prompt):
1757 | pass
1758 |
1759 | def call_llm(prompt, use_cache):
1760 | if use_cache:
1761 | return cached_call(prompt)
1762 | # Call the underlying function directly
1763 | return cached_call.__wrapped__(prompt)
1764 |
1765 | class SummarizeNode(Node):
1766 | def exec(self, text):
1767 | return call_llm(f"Summarize: {text}", self.cur_retry==0)
1768 | ```
1769 |
1770 | - Enable logging:
1771 |
1772 | ```python
1773 | def call_llm(prompt):
1774 | import logging
1775 | logging.info(f"Prompt: {prompt}")
1776 | response = ... # Your implementation here
1777 | logging.info(f"Response: {response}")
1778 | return response
1779 | ```
--------------------------------------------------------------------------------
/.cursorrules:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | title: "Agentic Coding"
4 | ---
5 |
6 | # Agentic Coding: Humans Design, Agents code!
7 |
8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
9 | {: .warning }
10 |
11 | ## Agentic Coding Steps
12 |
13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation:
14 |
15 | | Steps | Human | AI | Comment |
16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------|
17 | | 1. Requirements | ★★★ High | ★☆☆ Low | Humans understand the requirements and context. |
18 | | 2. Flow | ★★☆ Medium | ★★☆ Medium | Humans specify the high-level design, and the AI fills in the details. |
19 | | 3. Utilities | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. |
20 | | 4. Data | ★☆☆ Low | ★★★ High | AI designs the data schema, and humans verify. |
21 | | 5. Node | ★☆☆ Low | ★★★ High | The AI helps design the node based on the flow. |
22 | | 6. Implementation | ★☆☆ Low | ★★★ High | The AI implements the flow based on the design. |
23 | | 7. Optimization | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. |
24 | | 8. Reliability | ★☆☆ Low | ★★★ High | The AI writes test cases and addresses corner cases. |
25 |
26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit.
27 | - Understand AI systems' strengths and limitations:
28 | - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails)
29 | - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL)
30 | - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning)
31 | - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features.
32 | - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early.
33 |
34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes.
35 | - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)).
36 | - For each node in the flow, start with a high-level one-line description of what it does.
37 | - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine).
38 | - If using **Agent**, specify what are the inputs (context) and what are the possible actions.
39 | - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows.
40 | - Outline the flow and draw it in a mermaid diagram. For example:
41 | ```mermaid
42 | flowchart LR
43 | start[Start] --> batch[Batch]
44 | batch --> check[Check]
45 | check -->|OK| process
46 | check -->|Error| fix[Fix]
47 | fix --> check
48 |
49 | subgraph process[Process]
50 | step1[Step 1] --> step2[Step 2]
51 | end
52 |
53 | process --> endNode[End]
54 | ```
55 | - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition.
56 | {: .best-practice }
57 |
58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions.
59 | - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world:
60 |
61 |
62 | - Reading inputs (e.g., retrieving Slack messages, reading emails)
63 | - Writing outputs (e.g., generating reports, sending emails)
64 | - Using external tools (e.g., calling LLMs, searching the web)
65 | - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system.
66 | - For each utility function, implement it and write a simple test.
67 | - Document their input/output, as well as why they are necessary. For example:
68 | - `name`: `get_embedding` (`utils/get_embedding.py`)
69 | - `input`: `str`
70 | - `output`: a vector of 3072 floats
71 | - `necessity`: Used by the second node to embed text
72 | - Example utility implementation:
73 | ```python
74 | # utils/call_llm.py
75 | from openai import OpenAI
76 |
77 | def call_llm(prompt):
78 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
79 | r = client.chat.completions.create(
80 | model="gpt-4o",
81 | messages=[{"role": "user", "content": prompt}]
82 | )
83 | return r.choices[0].message.content
84 |
85 | if __name__ == "__main__":
86 | prompt = "What is the meaning of life?"
87 | print(call_llm(prompt))
88 | ```
89 | - > **Sometimes, design Utilities before Flow:** For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them.
90 | {: .best-practice }
91 | - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures.
92 | {: .warning }
93 |
94 | 4. **Data Design**: Design the shared store that nodes will use to communicate.
95 | - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data.
96 | - For simple systems, use an in-memory dictionary.
97 | - For more complex systems or when persistence is required, use a database.
98 | - **Don't Repeat Yourself**: Use in-memory references or foreign keys.
99 | - Example shared store design:
100 | ```python
101 | shared = {
102 | "user": {
103 | "id": "user123",
104 | "context": { # Another nested dict
105 | "weather": {"temp": 72, "condition": "sunny"},
106 | "location": "San Francisco"
107 | }
108 | },
109 | "results": {} # Empty dict to store outputs
110 | }
111 | ```
112 |
113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions.
114 | - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example:
115 | - `type`: Regular (or Batch, or Async)
116 | - `prep`: Read "text" from the shared store
117 | - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures.
118 | - `post`: Write "embedding" to the shared store
119 |
120 | 6. **Implementation**: Implement the initial nodes and flows based on the design.
121 | - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins!
122 | - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking.
123 | - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system.
124 | - Add logging throughout the code to facilitate debugging.
125 |
126 | 7. **Optimization**:
127 | - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start.
128 | - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts.
129 | - If your flow design is already solid, move on to micro-optimizations:
130 | - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity.
131 | - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone.
132 |
133 | - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times.
134 | >
135 | >
136 | {: .best-practice }
137 |
138 | 8. **Reliability**
139 | - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times.
140 | - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging.
141 | - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain.
142 |
143 | ## Example LLM Project File Structure
144 |
145 | ```
146 | my_project/
147 | ├── main.py
148 | ├── nodes.py
149 | ├── flow.py
150 | ├── utils/
151 | │ ├── __init__.py
152 | │ ├── call_llm.py
153 | │ └── search_web.py
154 | ├── requirements.txt
155 | └── docs/
156 | └── design.md
157 | ```
158 |
159 | - **`requirements.txt`**: Lists the Python dependencies for the project.
160 | ```
161 | PyYAML
162 | pocketflow
163 | ```
164 |
165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*.
166 | ~~~
167 | # Design Doc: Your Project Name
168 |
169 | > Please DON'T remove notes for AI
170 |
171 | ## Requirements
172 |
173 | > Notes for AI: Keep it simple and clear.
174 | > If the requirements are abstract, write concrete user stories
175 |
176 |
177 | ## Flow Design
178 |
179 | > Notes for AI:
180 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
181 | > 2. Present a concise, high-level description of the workflow.
182 |
183 | ### Applicable Design Pattern:
184 |
185 | 1. Map the file summary into chunks, then reduce these chunks into a final summary.
186 | 2. Agentic file finder
187 | - *Context*: The entire summary of the file
188 | - *Action*: Find the file
189 |
190 | ### Flow high-level Design:
191 |
192 | 1. **First Node**: This node is for ...
193 | 2. **Second Node**: This node is for ...
194 | 3. **Third Node**: This node is for ...
195 |
196 | ```mermaid
197 | flowchart TD
198 | firstNode[First Node] --> secondNode[Second Node]
199 | secondNode --> thirdNode[Third Node]
200 | ```
201 | ## Utility Functions
202 |
203 | > Notes for AI:
204 | > 1. Understand the utility function definition thoroughly by reviewing the doc.
205 | > 2. Include only the necessary utility functions, based on nodes in the flow.
206 |
207 | 1. **Call LLM** (`utils/call_llm.py`)
208 | - *Input*: prompt (str)
209 | - *Output*: response (str)
210 | - Generally used by most nodes for LLM tasks
211 |
212 | 2. **Embedding** (`utils/get_embedding.py`)
213 | - *Input*: str
214 | - *Output*: a vector of 3072 floats
215 | - Used by the second node to embed text
216 |
217 | ## Node Design
218 |
219 | ### Shared Store
220 |
221 | > Notes for AI: Try to minimize data redundancy
222 |
223 | The shared store structure is organized as follows:
224 |
225 | ```python
226 | shared = {
227 | "key": "value"
228 | }
229 | ```
230 |
231 | ### Node Steps
232 |
233 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
234 |
235 | 1. First Node
236 | - *Purpose*: Provide a short explanation of the node’s function
237 | - *Type*: Decide between Regular, Batch, or Async
238 | - *Steps*:
239 | - *prep*: Read "key" from the shared store
240 | - *exec*: Call the utility function
241 | - *post*: Write "key" to the shared store
242 |
243 | 2. Second Node
244 | ...
245 | ~~~
246 |
247 |
248 | - **`utils/`**: Contains all utility functions.
249 | - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`.
250 | - Each file should also include a `main()` function to try that API call
251 | ```python
252 | from google import genai
253 | import os
254 |
255 | def call_llm(prompt: str) -> str:
256 | client = genai.Client(
257 | api_key=os.getenv("GEMINI_API_KEY", ""),
258 | )
259 | model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
260 | response = client.models.generate_content(model=model, contents=[prompt])
261 | return response.text
262 |
263 | if __name__ == "__main__":
264 | test_prompt = "Hello, how are you?"
265 |
266 |     # Make a test call
267 |     print("Making call...")
268 |     response1 = call_llm(test_prompt)
269 | print(f"Response: {response1}")
270 | ```
271 |
272 | - **`nodes.py`**: Contains all the node definitions.
273 | ```python
274 | # nodes.py
275 | from pocketflow import Node
276 | from utils.call_llm import call_llm
277 |
278 | class GetQuestionNode(Node):
279 | def exec(self, _):
280 | # Get question directly from user input
281 | user_question = input("Enter your question: ")
282 | return user_question
283 |
284 | def post(self, shared, prep_res, exec_res):
285 | # Store the user's question
286 | shared["question"] = exec_res
287 | return "default" # Go to the next node
288 |
289 | class AnswerNode(Node):
290 | def prep(self, shared):
291 | # Read question from shared
292 | return shared["question"]
293 |
294 | def exec(self, question):
295 | # Call LLM to get the answer
296 | return call_llm(question)
297 |
298 | def post(self, shared, prep_res, exec_res):
299 | # Store the answer in shared
300 | shared["answer"] = exec_res
301 | ```
302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them.
303 | ```python
304 | # flow.py
305 | from pocketflow import Flow
306 | from nodes import GetQuestionNode, AnswerNode
307 |
308 | def create_qa_flow():
309 | """Create and return a question-answering flow."""
310 | # Create nodes
311 | get_question_node = GetQuestionNode()
312 | answer_node = AnswerNode()
313 |
314 | # Connect nodes in sequence
315 | get_question_node >> answer_node
316 |
317 | # Create flow starting with input node
318 | return Flow(start=get_question_node)
319 | ```
320 | - **`main.py`**: Serves as the project's entry point.
321 | ```python
322 | # main.py
323 | from flow import create_qa_flow
324 |
325 | # Example main function
326 | # Please replace this with your own main function
327 | def main():
328 | shared = {
329 | "question": None, # Will be populated by GetQuestionNode from user input
330 | "answer": None # Will be populated by AnswerNode
331 | }
332 |
333 | # Create the flow and run it
334 | qa_flow = create_qa_flow()
335 | qa_flow.run(shared)
336 | print(f"Question: {shared['question']}")
337 | print(f"Answer: {shared['answer']}")
338 |
339 | if __name__ == "__main__":
340 | main()
341 | ```
342 |
343 | ================================================
344 | File: docs/index.md
345 | ================================================
346 | ---
347 | layout: default
348 | title: "Home"
349 | nav_order: 1
350 | ---
351 |
352 | # Pocket Flow
353 |
354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
355 |
356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies and ZERO vendor lock-in.
357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more.
358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
359 |
360 |
361 |

362 |
363 |
364 | ## Core Abstraction
365 |
366 | We model the LLM workflow as a **Graph + Shared Store**:
367 |
368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks.
369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges).
370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows.
371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks.
372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks.
373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks.
374 |
375 |
376 |

377 |
378 |
379 | ## Design Pattern
380 |
381 | From there, it’s easy to implement popular design patterns:
382 |
383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions.
384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines.
385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation.
386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps.
387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently.
388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents.
389 |
390 |
391 |

392 |
393 |
394 | ## Utility Function
395 |
396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
397 |
398 | - [LLM Wrapper](./utility_function/llm.md)
399 | - [Viz and Debug](./utility_function/viz.md)
400 | - [Web Search](./utility_function/websearch.md)
401 | - [Chunking](./utility_function/chunking.md)
402 | - [Embedding](./utility_function/embedding.md)
403 | - [Vector Databases](./utility_function/vector.md)
404 | - [Text-to-Speech](./utility_function/text_to_speech.md)
405 |
406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework:
407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
410 |
411 | ## Ready to build your Apps?
412 |
413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow!
414 |
415 | ================================================
416 | File: docs/core_abstraction/async.md
417 | ================================================
418 | ---
419 | layout: default
420 | title: "(Advanced) Async"
421 | parent: "Core Abstraction"
422 | nav_order: 5
423 | ---
424 |
425 | # (Advanced) Async
426 |
427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for:
428 |
429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way.
430 | 2. **exec_async()**: Typically used for async LLM calls.
431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`.
432 |
433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes.
434 |
435 | ### Example
436 |
437 | ```python
438 | class SummarizeThenVerify(AsyncNode):
439 | async def prep_async(self, shared):
440 | # Example: read a file asynchronously
441 | doc_text = await read_file_async(shared["doc_path"])
442 | return doc_text
443 |
444 | async def exec_async(self, prep_res):
445 | # Example: async LLM call
446 | summary = await call_llm_async(f"Summarize: {prep_res}")
447 | return summary
448 |
449 | async def post_async(self, shared, prep_res, exec_res):
450 | # Example: wait for user feedback
451 | decision = await gather_user_feedback(exec_res)
452 | if decision == "approve":
453 | shared["summary"] = exec_res
454 | return "approve"
455 | return "deny"
456 |
457 | summarize_node = SummarizeThenVerify()
458 | final_node = Finalize()
459 |
460 | # Define transitions
461 | summarize_node - "approve" >> final_node
462 | summarize_node - "deny" >> summarize_node # retry
463 |
464 | flow = AsyncFlow(start=summarize_node)
465 |
466 | async def main():
467 | shared = {"doc_path": "document.txt"}
468 | await flow.run_async(shared)
469 | print("Final Summary:", shared.get("summary"))
470 |
471 | asyncio.run(main())
472 | ```
473 |
474 | ================================================
475 | File: docs/core_abstraction/batch.md
476 | ================================================
477 | ---
478 | layout: default
479 | title: "Batch"
480 | parent: "Core Abstraction"
481 | nav_order: 4
482 | ---
483 |
484 | # Batch
485 |
486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases:
487 | - **Chunk-based** processing (e.g., splitting large texts).
488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs).
489 |
490 | ## 1. BatchNode
491 |
492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`:
493 |
494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator).
495 | - **`exec(item)`**: called **once** per item in that iterable.
496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**.
497 |
498 |
499 | ### Example: Summarize a Large File
500 |
501 | ```python
502 | class MapSummaries(BatchNode):
503 | def prep(self, shared):
504 | # Suppose we have a big file; chunk it
505 | content = shared["data"]
506 | chunk_size = 10000
507 | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
508 | return chunks
509 |
510 | def exec(self, chunk):
511 | prompt = f"Summarize this chunk in 10 words: {chunk}"
512 | summary = call_llm(prompt)
513 | return summary
514 |
515 | def post(self, shared, prep_res, exec_res_list):
516 | combined = "\n".join(exec_res_list)
517 | shared["summary"] = combined
518 | return "default"
519 |
520 | map_summaries = MapSummaries()
521 | flow = Flow(start=map_summaries)
522 | flow.run(shared)
523 | ```
524 |
525 | ---
526 |
527 | ## 2. BatchFlow
528 |
529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set.
530 |
531 | ### Example: Summarize Many Files
532 |
533 | ```python
534 | class SummarizeAllFiles(BatchFlow):
535 | def prep(self, shared):
536 | # Return a list of param dicts (one per file)
537 | filenames = list(shared["data"].keys()) # e.g., ["file1.txt", "file2.txt", ...]
538 | return [{"filename": fn} for fn in filenames]
539 |
540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce):
541 | summarize_file = SummarizeFile(start=load_file)
542 |
543 | # Wrap that flow into a BatchFlow:
544 | summarize_all_files = SummarizeAllFiles(start=summarize_file)
545 | summarize_all_files.run(shared)
546 | ```
547 |
548 | ### Under the Hood
549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`.
550 | 2. The **BatchFlow** loops through each dict. For each one:
551 | - It merges the dict with the BatchFlow’s own `params`.
552 | - It calls `flow.run(shared)` using the merged result.
553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict.
554 |
555 | ---
556 |
557 | ## 3. Nested or Multi-Level Batches
558 |
559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance:
560 | - **Outer** batch: returns a list of directory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...).
561 | - **Inner** batch: returns a list of per-file param dicts.
562 |
563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once.
564 |
565 | ```python
566 |
567 | class FileBatchFlow(BatchFlow):
568 | def prep(self, shared):
569 | directory = self.params["directory"]
570 | # e.g., files = ["file1.txt", "file2.txt", ...]
571 | files = [f for f in os.listdir(directory) if f.endswith(".txt")]
572 | return [{"filename": f} for f in files]
573 |
574 | class DirectoryBatchFlow(BatchFlow):
575 | def prep(self, shared):
576 | directories = [ "/path/to/dirA", "/path/to/dirB"]
577 | return [{"directory": d} for d in directories]
578 |
579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"}
580 | inner_flow = FileBatchFlow(start=MapSummaries())
581 | outer_flow = DirectoryBatchFlow(start=inner_flow)
582 | ```
583 |
584 | ================================================
585 | File: docs/core_abstraction/communication.md
586 | ================================================
587 | ---
588 | layout: default
589 | title: "Communication"
590 | parent: "Core Abstraction"
591 | nav_order: 3
592 | ---
593 |
594 | # Communication
595 |
596 | Nodes and Flows **communicate** in 2 ways:
597 |
598 | 1. **Shared Store (for almost all the cases)**
599 |
600 | - A global data structure (often an in-mem dict) that all nodes can read ( `prep()`) and write (`post()`).
601 | - Great for data results, large content, or anything multiple nodes need.
602 | - You shall design the data structure and populate it in advance.
603 |
604 | - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](./batch.md).
605 | {: .best-practice }
606 |
607 | 2. **Params (only for [Batch](./batch.md))**
608 | - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**.
609 | - Good for identifiers like filenames or numeric IDs, in Batch mode.
610 |
611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller).
612 |
613 | ---
614 |
615 | ## 1. Shared Store
616 |
617 | ### Overview
618 |
619 | A shared store is typically an in-mem dictionary, like:
620 | ```python
621 | shared = {"data": {}, "summary": {}, "config": {...}, ...}
622 | ```
623 |
624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements.
625 |
626 | ### Example
627 |
628 | ```python
629 | class LoadData(Node):
630 | def post(self, shared, prep_res, exec_res):
631 | # We write data to shared store
632 | shared["data"] = "Some text content"
633 | return None
634 |
635 | class Summarize(Node):
636 | def prep(self, shared):
637 | # We read data from shared store
638 | return shared["data"]
639 |
640 | def exec(self, prep_res):
641 | # Call LLM to summarize
642 | prompt = f"Summarize: {prep_res}"
643 | summary = call_llm(prompt)
644 | return summary
645 |
646 | def post(self, shared, prep_res, exec_res):
647 | # We write summary to shared store
648 | shared["summary"] = exec_res
649 | return "default"
650 |
651 | load_data = LoadData()
652 | summarize = Summarize()
653 | load_data >> summarize
654 | flow = Flow(start=load_data)
655 |
656 | shared = {}
657 | flow.run(shared)
658 | ```
659 |
660 | Here:
661 | - `LoadData` writes to `shared["data"]`.
662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`.
663 |
664 | ---
665 |
666 | ## 2. Params
667 |
668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are:
669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`).
670 | - **Set** via `set_params()`.
671 | - **Cleared** and updated each time a parent Flow calls it.
672 |
673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow.
674 | >
675 | > If you need to set child node params, see [Batch](./batch.md).
676 | {: .warning }
677 |
678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store.
679 |
680 | ### Example
681 |
682 | ```python
683 | # 1) Create a Node that uses params
684 | class SummarizeFile(Node):
685 | def prep(self, shared):
686 | # Access the node's param
687 | filename = self.params["filename"]
688 | return shared["data"].get(filename, "")
689 |
690 | def exec(self, prep_res):
691 | prompt = f"Summarize: {prep_res}"
692 | return call_llm(prompt)
693 |
694 | def post(self, shared, prep_res, exec_res):
695 | filename = self.params["filename"]
696 | shared["summary"][filename] = exec_res
697 | return "default"
698 |
699 | # 2) Set params
700 | node = SummarizeFile()
701 |
702 | # 3) Set Node params directly (for testing)
703 | node.set_params({"filename": "doc1.txt"})
704 | node.run(shared)
705 |
706 | # 4) Create Flow
707 | flow = Flow(start=node)
708 |
709 | # 5) Set Flow params (overwrites node params)
710 | flow.set_params({"filename": "doc2.txt"})
711 | flow.run(shared) # The node summarizes doc2, not doc1
712 | ```
713 |
714 | ================================================
715 | File: docs/core_abstraction/flow.md
716 | ================================================
717 | ---
718 | layout: default
719 | title: "Flow"
720 | parent: "Core Abstraction"
721 | nav_order: 2
722 | ---
723 |
724 | # Flow
725 |
726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`.
727 |
728 | ## 1. Action-based Transitions
729 |
730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`.
731 |
732 | You define transitions with the syntax:
733 |
734 | 1. **Basic default transition**: `node_a >> node_b`
735 | This means if `node_a.post()` returns `"default"`, go to `node_b`.
736 | (Equivalent to `node_a - "default" >> node_b`)
737 |
738 | 2. **Named action transition**: `node_a - "action_name" >> node_b`
739 | This means if `node_a.post()` returns `"action_name"`, go to `node_b`.
740 |
741 | It's possible to create loops, branching, or multi-step flows.
742 |
743 | ## 2. Creating a Flow
744 |
745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node.
746 |
747 | ### Example: Simple Sequence
748 |
749 | Here's a minimal flow of two nodes in a chain:
750 |
751 | ```python
752 | node_a >> node_b
753 | flow = Flow(start=node_a)
754 | flow.run(shared)
755 | ```
756 |
757 | - When you run the flow, it executes `node_a`.
758 | - Suppose `node_a.post()` returns `"default"`.
759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`.
760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there.
761 |
762 | ### Example: Branching & Looping
763 |
764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions:
765 |
766 | - `"approved"`: expense is approved, move to payment processing
767 | - `"needs_revision"`: expense needs changes, send back for revision
768 | - `"rejected"`: expense is denied, finish the process
769 |
770 | We can wire them like this:
771 |
772 | ```python
773 | # Define the flow connections
774 | review - "approved" >> payment # If approved, process payment
775 | review - "needs_revision" >> revise # If needs changes, go to revision
776 | review - "rejected" >> finish # If rejected, finish the process
777 |
778 | revise >> review # After revision, go back for another review
779 | payment >> finish # After payment, finish the process
780 |
781 | flow = Flow(start=review)
782 | ```
783 |
784 | Let's see how it flows:
785 |
786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node
787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review`
788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops
789 |
790 | ```mermaid
791 | flowchart TD
792 | review[Review Expense] -->|approved| payment[Process Payment]
793 | review -->|needs_revision| revise[Revise Report]
794 | review -->|rejected| finish[Finish Process]
795 |
796 | revise --> review
797 | payment --> finish
798 | ```
799 |
800 | ### Running Individual Nodes vs. Running a Flow
801 |
802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action.
803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue.
804 |
805 | > `node.run(shared)` **does not** proceed to the successor.
806 | > This is mainly for debugging or testing a single node.
807 | >
808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly.
809 | {: .warning }
810 |
811 | ## 3. Nested Flows
812 |
813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can:
814 |
815 | 1. Use a Flow as a Node within another Flow's transitions.
816 | 2. Combine multiple smaller Flows into a larger Flow for reuse.
817 | 3. Node `params` will be a merging of **all** parents' `params`.
818 |
819 | ### Flow's Node Methods
820 |
821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However:
822 |
823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes.
824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store.
825 |
826 | ### Basic Flow Nesting
827 |
828 | Here's how to connect a flow to another node:
829 |
830 | ```python
831 | # Create a sub-flow
832 | node_a >> node_b
833 | subflow = Flow(start=node_a)
834 |
835 | # Connect it to another node
836 | subflow >> node_c
837 |
838 | # Create the parent flow
839 | parent_flow = Flow(start=subflow)
840 | ```
841 |
842 | When `parent_flow.run()` executes:
843 | 1. It starts `subflow`
844 | 2. `subflow` runs through its nodes (`node_a->node_b`)
845 | 3. After `subflow` completes, execution continues to `node_c`
846 |
847 | ### Example: Order Processing Pipeline
848 |
849 | Here's a practical example that breaks down order processing into nested flows:
850 |
851 | ```python
852 | # Payment processing sub-flow
853 | validate_payment >> process_payment >> payment_confirmation
854 | payment_flow = Flow(start=validate_payment)
855 |
856 | # Inventory sub-flow
857 | check_stock >> reserve_items >> update_inventory
858 | inventory_flow = Flow(start=check_stock)
859 |
860 | # Shipping sub-flow
861 | create_label >> assign_carrier >> schedule_pickup
862 | shipping_flow = Flow(start=create_label)
863 |
864 | # Connect the flows into a main order pipeline
865 | payment_flow >> inventory_flow >> shipping_flow
866 |
867 | # Create the master flow
868 | order_pipeline = Flow(start=payment_flow)
869 |
870 | # Run the entire pipeline
871 | order_pipeline.run(shared_data)
872 | ```
873 |
874 | This creates a clean separation of concerns while maintaining a clear execution path:
875 |
876 | ```mermaid
877 | flowchart LR
878 | subgraph order_pipeline[Order Pipeline]
879 | subgraph paymentFlow["Payment Flow"]
880 | A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation]
881 | end
882 |
883 | subgraph inventoryFlow["Inventory Flow"]
884 | D[Check Stock] --> E[Reserve Items] --> F[Update Inventory]
885 | end
886 |
887 | subgraph shippingFlow["Shipping Flow"]
888 | G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup]
889 | end
890 |
891 | paymentFlow --> inventoryFlow
892 | inventoryFlow --> shippingFlow
893 | end
894 | ```
895 |
896 | ================================================
897 | File: docs/core_abstraction/node.md
898 | ================================================
899 | ---
900 | layout: default
901 | title: "Node"
902 | parent: "Core Abstraction"
903 | nav_order: 1
904 | ---
905 |
906 | # Node
907 |
908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`:
909 |
910 |
911 |

912 |
913 |
914 | 1. `prep(shared)`
915 | - **Read and preprocess data** from `shared` store.
916 | - Examples: *query DB, read files, or serialize data into a string*.
917 | - Return `prep_res`, which is used by `exec()` and `post()`.
918 |
919 | 2. `exec(prep_res)`
920 | - **Execute compute logic**, with optional retries and error handling (below).
921 | - Examples: *(mostly) LLM calls, remote APIs, tool use*.
922 | - ⚠️ This shall be only for compute and **NOT** access `shared`.
923 | - ⚠️ If retries enabled, ensure idempotent implementation.
924 | - ⚠️ Defer exception handling to the Node's built-in retry mechanism.
925 | - Return `exec_res`, which is passed to `post()`.
926 |
927 | 3. `post(shared, prep_res, exec_res)`
928 | - **Postprocess and write data** back to `shared`.
929 | - Examples: *update DB, change states, log results*.
930 | - **Decide the next action** by returning a *string* (`action = "default"` if *None*).
931 |
932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately.
933 | >
934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data.
935 | {: .note }
936 |
937 | ### Fault Tolerance & Retries
938 |
939 | You can **retry** `exec()` if it raises an exception via two parameters when defining the Node:
940 |
941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry).
942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting).
943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off.
944 |
945 | ```python
946 | my_node = SummarizeFile(max_retries=3, wait=10)
947 | ```
948 |
949 | When an exception occurs in `exec()`, the Node automatically retries until:
950 |
951 | - It either succeeds, or
952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt.
953 |
954 | You can get the current retry count (0-based) from `self.cur_retry`.
955 |
956 | ```python
957 | class RetryNode(Node):
958 | def exec(self, prep_res):
959 | print(f"Retry {self.cur_retry} times")
960 | raise Exception("Failed")
961 | ```
962 |
963 | ### Graceful Fallback
964 |
965 | To **gracefully handle** the exception (after all retries) rather than raising it, override:
966 |
967 | ```python
968 | def exec_fallback(self, prep_res, exc):
969 | raise exc
970 | ```
971 |
972 | By default, it just re-raises the exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`.
973 |
974 | ### Example: Summarize file
975 |
976 | ```python
977 | class SummarizeFile(Node):
978 | def prep(self, shared):
979 | return shared["data"]
980 |
981 | def exec(self, prep_res):
982 | if not prep_res:
983 | return "Empty file content"
984 | prompt = f"Summarize this text in 10 words: {prep_res}"
985 | summary = call_llm(prompt) # might fail
986 | return summary
987 |
988 | def exec_fallback(self, prep_res, exc):
989 | # Provide a simple fallback instead of crashing
990 | return "There was an error processing your request."
991 |
992 | def post(self, shared, prep_res, exec_res):
993 | shared["summary"] = exec_res
994 | # Return "default" by not returning
995 |
996 | summarize_node = SummarizeFile(max_retries=3)
997 |
998 | # node.run() calls prep->exec->post
999 | # If exec() fails, it retries up to 3 times before calling exec_fallback()
1000 | action_result = summarize_node.run(shared)
1001 |
1002 | print("Action returned:", action_result) # "default"
1003 | print("Summary stored:", shared["summary"])
1004 | ```
1005 |
1006 | ================================================
1007 | File: docs/core_abstraction/parallel.md
1008 | ================================================
1009 | ---
1010 | layout: default
1011 | title: "(Advanced) Parallel"
1012 | parent: "Core Abstraction"
1013 | nav_order: 6
1014 | ---
1015 |
1016 | # (Advanced) Parallel
1017 |
1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute.
1019 |
1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O.
1021 | {: .warning }
1022 |
1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize.
1024 | >
1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals).
1026 | >
1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits.
1028 | {: .best-practice }
1029 |
1030 | ## AsyncParallelBatchNode
1031 |
1032 | Like **AsyncBatchNode**, but run `exec_async()` in **parallel**:
1033 |
1034 | ```python
1035 | class ParallelSummaries(AsyncParallelBatchNode):
1036 | async def prep_async(self, shared):
1037 | # e.g., multiple texts
1038 | return shared["texts"]
1039 |
1040 | async def exec_async(self, text):
1041 | prompt = f"Summarize: {text}"
1042 | return await call_llm_async(prompt)
1043 |
1044 | async def post_async(self, shared, prep_res, exec_res_list):
1045 | shared["summary"] = "\n\n".join(exec_res_list)
1046 | return "default"
1047 |
1048 | node = ParallelSummaries()
1049 | flow = AsyncFlow(start=node)
1050 | ```
1051 |
1052 | ## AsyncParallelBatchFlow
1053 |
1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters:
1055 |
1056 | ```python
1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow):
1058 | async def prep_async(self, shared):
1059 | return [{"filename": f} for f in shared["files"]]
1060 |
1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile())
1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow)
1063 | await parallel_flow.run_async(shared)
1064 | ```
1065 |
1066 | ================================================
1067 | File: docs/design_pattern/agent.md
1068 | ================================================
1069 | ---
1070 | layout: default
1071 | title: "Agent"
1072 | parent: "Design Pattern"
1073 | nav_order: 1
1074 | ---
1075 |
1076 | # Agent
1077 |
1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context.
1079 |
1080 |
1081 |

1082 |
1083 |
1084 | ## Implement Agent with Graph
1085 |
1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions.
1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
1088 | 3. **Agent Node:** Provide a prompt to decide action—for example:
1089 |
1090 | ```python
1091 | f"""
1092 | ### CONTEXT
1093 | Task: {task_description}
1094 | Previous Actions: {previous_actions}
1095 | Current State: {current_state}
1096 |
1097 | ### ACTION SPACE
1098 | [1] search
1099 | Description: Use web search to get results
1100 | Parameters:
1101 | - query (str): What to search for
1102 |
1103 | [2] answer
1104 | Description: Conclude based on the results
1105 | Parameters:
1106 | - result (str): Final answer to provide
1107 |
1108 | ### NEXT ACTION
1109 | Decide the next action based on the current context and available action space.
1110 | Return your response in the following format:
1111 |
1112 | ```yaml
1113 | thinking: |
1114 |
1115 | action:
1116 | parameters:
1117 | :
1118 | ```"""
1119 | ```
1120 |
1121 | The core of building **high-performance** and **reliable** agents boils down to:
1122 |
1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content.
1124 |
1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or `read_csvs`. Instead, import CSVs into the database.
1126 |
1127 | ## Example Good Action Design
1128 |
1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once.
1130 |
1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts).
1132 |
1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files.
1134 |
1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends.
1136 |
1137 | ## Example: Search Agent
1138 |
1139 | This agent:
1140 | 1. Decides whether to search or answer
1141 | 2. If it searches, it loops back to decide whether more searching is needed
1142 | 3. Answers when enough context gathered
1143 |
1144 | ```python
1145 | class DecideAction(Node):
1146 | def prep(self, shared):
1147 | context = shared.get("context", "No previous search")
1148 | query = shared["query"]
1149 | return query, context
1150 |
1151 | def exec(self, inputs):
1152 | query, context = inputs
1153 | prompt = f"""
1154 | Given input: {query}
1155 | Previous search results: {context}
1156 | Should I: 1) Search web for more info 2) Answer with current knowledge
1157 | Output in yaml:
1158 | ```yaml
1159 | action: search/answer
1160 | reason: why this action
1161 | search_term: search phrase if action is search
1162 | ```"""
1163 | resp = call_llm(prompt)
1164 | yaml_str = resp.split("```yaml")[1].split("```")[0].strip()
1165 | result = yaml.safe_load(yaml_str)
1166 |
1167 | assert isinstance(result, dict)
1168 | assert "action" in result
1169 | assert "reason" in result
1170 | assert result["action"] in ["search", "answer"]
1171 | if result["action"] == "search":
1172 | assert "search_term" in result
1173 |
1174 | return result
1175 |
1176 | def post(self, shared, prep_res, exec_res):
1177 | if exec_res["action"] == "search":
1178 | shared["search_term"] = exec_res["search_term"]
1179 | return exec_res["action"]
1180 |
1181 | class SearchWeb(Node):
1182 | def prep(self, shared):
1183 | return shared["search_term"]
1184 |
1185 | def exec(self, search_term):
1186 | return search_web(search_term)
1187 |
1188 | def post(self, shared, prep_res, exec_res):
1189 | prev_searches = shared.get("context", [])
1190 | shared["context"] = prev_searches + [
1191 | {"term": shared["search_term"], "result": exec_res}
1192 | ]
1193 | return "decide"
1194 |
1195 | class DirectAnswer(Node):
1196 | def prep(self, shared):
1197 | return shared["query"], shared.get("context", "")
1198 |
1199 | def exec(self, inputs):
1200 | query, context = inputs
1201 | return call_llm(f"Context: {context}\nAnswer: {query}")
1202 |
1203 | def post(self, shared, prep_res, exec_res):
1204 | print(f"Answer: {exec_res}")
1205 | shared["answer"] = exec_res
1206 |
1207 | # Connect nodes
1208 | decide = DecideAction()
1209 | search = SearchWeb()
1210 | answer = DirectAnswer()
1211 |
1212 | decide - "search" >> search
1213 | decide - "answer" >> answer
1214 | search - "decide" >> decide # Loop back
1215 |
1216 | flow = Flow(start=decide)
1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"})
1218 | ```
1219 |
1220 | ================================================
1221 | File: docs/design_pattern/mapreduce.md
1222 | ================================================
1223 | ---
1224 | layout: default
1225 | title: "Map Reduce"
1226 | parent: "Design Pattern"
1227 | nav_order: 4
1228 | ---
1229 |
1230 | # Map Reduce
1231 |
1232 | MapReduce is a design pattern suitable when you have either:
1233 | - Large input data (e.g., multiple files to process), or
1234 | - Large output data (e.g., multiple forms to fill)
1235 |
1236 | and there is a logical way to break the task into smaller, ideally independent parts.
1237 |
1238 |
1239 |

1240 |
1241 |
1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
1243 |
1244 | ### Example: Document Summarization
1245 |
1246 | ```python
1247 | class SummarizeAllFiles(BatchNode):
1248 | def prep(self, shared):
1249 | files_dict = shared["files"] # e.g. 10 files
1250 | return list(files_dict.items()) # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...]
1251 |
1252 | def exec(self, one_file):
1253 | filename, file_content = one_file
1254 | summary_text = call_llm(f"Summarize the following file:\n{file_content}")
1255 | return (filename, summary_text)
1256 |
1257 | def post(self, shared, prep_res, exec_res_list):
1258 | shared["file_summaries"] = dict(exec_res_list)
1259 |
1260 | class CombineSummaries(Node):
1261 | def prep(self, shared):
1262 | return shared["file_summaries"]
1263 |
1264 | def exec(self, file_summaries):
1265 | # format as: "File1: summary\nFile2: summary...\n"
1266 | text_list = []
1267 | for fname, summ in file_summaries.items():
1268 | text_list.append(f"{fname} summary:\n{summ}\n")
1269 | big_text = "\n---\n".join(text_list)
1270 |
1271 | return call_llm(f"Combine these file summaries into one final summary:\n{big_text}")
1272 |
1273 | def post(self, shared, prep_res, final_summary):
1274 | shared["all_files_summary"] = final_summary
1275 |
1276 | batch_node = SummarizeAllFiles()
1277 | combine_node = CombineSummaries()
1278 | batch_node >> combine_node
1279 |
1280 | flow = Flow(start=batch_node)
1281 |
1282 | shared = {
1283 | "files": {
1284 | "file1.txt": "Alice was beginning to get very tired of sitting by her sister...",
1285 | "file2.txt": "Some other interesting text ...",
1286 | # ...
1287 | }
1288 | }
1289 | flow.run(shared)
1290 | print("Individual Summaries:", shared["file_summaries"])
1291 | print("\nFinal Summary:\n", shared["all_files_summary"])
1292 | ```
1293 |
1294 | ================================================
1295 | File: docs/design_pattern/rag.md
1296 | ================================================
1297 | ---
1298 | layout: default
1299 | title: "RAG"
1300 | parent: "Design Pattern"
1301 | nav_order: 3
1302 | ---
1303 |
1304 | # RAG (Retrieval Augmented Generation)
1305 |
1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline:
1307 |
1308 |
1309 |

1310 |
1311 |
1312 | 1. **Offline stage**: Preprocess and index documents ("building the index").
1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context.
1314 |
1315 | ---
1316 | ## Stage 1: Offline Indexing
1317 |
1318 | We create three Nodes:
1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text.
1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk.
1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md).
1322 |
1323 | ```python
1324 | class ChunkDocs(BatchNode):
1325 | def prep(self, shared):
1326 | # A list of file paths in shared["files"]. We process each file.
1327 | return shared["files"]
1328 |
1329 | def exec(self, filepath):
1330 | # read file content. In real usage, do error handling.
1331 | with open(filepath, "r", encoding="utf-8") as f:
1332 | text = f.read()
1333 | # chunk by 100 chars each
1334 | chunks = []
1335 | size = 100
1336 | for i in range(0, len(text), size):
1337 | chunks.append(text[i : i + size])
1338 | return chunks
1339 |
1340 | def post(self, shared, prep_res, exec_res_list):
1341 | # exec_res_list is a list of chunk-lists, one per file.
1342 | # flatten them all into a single list of chunks.
1343 | all_chunks = []
1344 | for chunk_list in exec_res_list:
1345 | all_chunks.extend(chunk_list)
1346 | shared["all_chunks"] = all_chunks
1347 |
1348 | class EmbedDocs(BatchNode):
1349 | def prep(self, shared):
1350 | return shared["all_chunks"]
1351 |
1352 | def exec(self, chunk):
1353 | return get_embedding(chunk)
1354 |
1355 | def post(self, shared, prep_res, exec_res_list):
1356 | # Store the list of embeddings.
1357 | shared["all_embeds"] = exec_res_list
1358 | print(f"Total embeddings: {len(exec_res_list)}")
1359 |
1360 | class StoreIndex(Node):
1361 | def prep(self, shared):
1362 | # We'll read all embeds from shared.
1363 | return shared["all_embeds"]
1364 |
1365 | def exec(self, all_embeds):
1366 | # Create a vector index (faiss or other DB in real usage).
1367 | index = create_index(all_embeds)
1368 | return index
1369 |
1370 | def post(self, shared, prep_res, index):
1371 | shared["index"] = index
1372 |
1373 | # Wire them in sequence
1374 | chunk_node = ChunkDocs()
1375 | embed_node = EmbedDocs()
1376 | store_node = StoreIndex()
1377 |
1378 | chunk_node >> embed_node >> store_node
1379 |
1380 | OfflineFlow = Flow(start=chunk_node)
1381 | ```
1382 |
1383 | Usage example:
1384 |
1385 | ```python
1386 | shared = {
1387 | "files": ["doc1.txt", "doc2.txt"], # any text files
1388 | }
1389 | OfflineFlow.run(shared)
1390 | ```
1391 |
1392 | ---
1393 | ## Stage 2: Online Query & Answer
1394 |
1395 | We have 3 nodes:
1396 | 1. `EmbedQuery` – embeds the user’s question.
1397 | 2. `RetrieveDocs` – retrieves the top chunk from the index.
1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer.
1399 |
1400 | ```python
1401 | class EmbedQuery(Node):
1402 | def prep(self, shared):
1403 | return shared["question"]
1404 |
1405 | def exec(self, question):
1406 | return get_embedding(question)
1407 |
1408 | def post(self, shared, prep_res, q_emb):
1409 | shared["q_emb"] = q_emb
1410 |
1411 | class RetrieveDocs(Node):
1412 | def prep(self, shared):
1413 | # We'll need the query embedding, plus the offline index/chunks
1414 | return shared["q_emb"], shared["index"], shared["all_chunks"]
1415 |
1416 | def exec(self, inputs):
1417 | q_emb, index, chunks = inputs
1418 | I, D = search_index(index, q_emb, top_k=1)
1419 | best_id = I[0][0]
1420 | relevant_chunk = chunks[best_id]
1421 | return relevant_chunk
1422 |
1423 | def post(self, shared, prep_res, relevant_chunk):
1424 | shared["retrieved_chunk"] = relevant_chunk
1425 | print("Retrieved chunk:", relevant_chunk[:60], "...")
1426 |
1427 | class GenerateAnswer(Node):
1428 | def prep(self, shared):
1429 | return shared["question"], shared["retrieved_chunk"]
1430 |
1431 | def exec(self, inputs):
1432 | question, chunk = inputs
1433 | prompt = f"Question: {question}\nContext: {chunk}\nAnswer:"
1434 | return call_llm(prompt)
1435 |
1436 | def post(self, shared, prep_res, answer):
1437 | shared["answer"] = answer
1438 | print("Answer:", answer)
1439 |
1440 | embed_qnode = EmbedQuery()
1441 | retrieve_node = RetrieveDocs()
1442 | generate_node = GenerateAnswer()
1443 |
1444 | embed_qnode >> retrieve_node >> generate_node
1445 | OnlineFlow = Flow(start=embed_qnode)
1446 | ```
1447 |
1448 | Usage example:
1449 |
1450 | ```python
1451 | # Suppose we already ran OfflineFlow and have:
1452 | # shared["all_chunks"], shared["index"], etc.
1453 | shared["question"] = "Why do people like cats?"
1454 |
1455 | OnlineFlow.run(shared)
1456 | # final answer in shared["answer"]
1457 | ```
1458 |
1459 | ================================================
1460 | File: docs/design_pattern/structure.md
1461 | ================================================
1462 | ---
1463 | layout: default
1464 | title: "Structured Output"
1465 | parent: "Design Pattern"
1466 | nav_order: 5
1467 | ---
1468 |
1469 | # Structured Output
1470 |
1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys.
1472 |
1473 | There are several approaches to achieve a structured output:
1474 | - **Prompting** the LLM to strictly return a defined structure.
1475 | - Using LLMs that natively support **schema enforcement**.
1476 | - **Post-processing** the LLM's response to extract structured content.
1477 |
1478 | In practice, **Prompting** is simple and reliable for modern LLMs.
1479 |
1480 | ### Example Use Cases
1481 |
1482 | - Extracting Key Information
1483 |
1484 | ```yaml
1485 | product:
1486 | name: Widget Pro
1487 | price: 199.99
1488 | description: |
1489 | A high-quality widget designed for professionals.
1490 | Recommended for advanced users.
1491 | ```
1492 |
1493 | - Summarizing Documents into Bullet Points
1494 |
1495 | ```yaml
1496 | summary:
1497 | - This product is easy to use.
1498 | - It is cost-effective.
1499 | - Suitable for all skill levels.
1500 | ```
1501 |
1502 | - Generating Configuration Files
1503 |
1504 | ```yaml
1505 | server:
1506 | host: 127.0.0.1
1507 | port: 8080
1508 | ssl: true
1509 | ```
1510 |
1511 | ## Prompt Engineering
1512 |
1513 | When prompting the LLM to produce **structured** output:
1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`).
1515 | 2. **Validate** that all required fields exist (and let `Node` handle retries).
1516 |
1517 | ### Example Text Summarization
1518 |
1519 | ```python
1520 | class SummarizeNode(Node):
1521 | def exec(self, prep_res):
1522 | # Suppose `prep_res` is the text to summarize.
1523 | prompt = f"""
1524 | Please summarize the following text as YAML, with exactly 3 bullet points
1525 |
1526 | {prep_res}
1527 |
1528 | Now, output:
1529 | ```yaml
1530 | summary:
1531 | - bullet 1
1532 | - bullet 2
1533 | - bullet 3
1534 | ```"""
1535 | response = call_llm(prompt)
1536 | yaml_str = response.split("```yaml")[1].split("```")[0].strip()
1537 |
1538 | import yaml
1539 | structured_result = yaml.safe_load(yaml_str)
1540 |
1541 | assert "summary" in structured_result
1542 | assert isinstance(structured_result["summary"], list)
1543 |
1544 | return structured_result
1545 | ```
1546 |
1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic)
1548 | {: .note }
1549 |
1550 | ### Why YAML instead of JSON?
1551 |
1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes.
1553 |
1554 | **In JSON**
1555 |
1556 | ```json
1557 | {
1558 | "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\""
1559 | }
1560 | ```
1561 |
1562 | - Every double quote inside the string must be escaped with `\"`.
1563 | - Each newline in the dialogue must be represented as `\n`.
1564 |
1565 | **In YAML**
1566 |
1567 | ```yaml
1568 | dialogue: |
1569 | Alice said: "Hello Bob.
1570 | How are you?
1571 | I am good."
1572 | ```
1573 |
1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`).
1575 | - Newlines are naturally preserved without needing `\n`.
1576 |
1577 | ================================================
1578 | File: docs/design_pattern/workflow.md
1579 | ================================================
1580 | ---
1581 | layout: default
1582 | title: "Workflow"
1583 | parent: "Design Pattern"
1584 | nav_order: 2
1585 | ---
1586 |
1587 | # Workflow
1588 |
1589 | Many real-world tasks are too complex for one LLM call. The solution is **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes.
1590 |
1591 |
1592 |

1593 |
1594 |
1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*.
1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*.
1597 | >
1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md).
1599 | {: .best-practice }
1600 |
1601 | ### Example: Article Writing
1602 |
1603 | ```python
1604 | class GenerateOutline(Node):
1605 | def prep(self, shared): return shared["topic"]
1606 | def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}")
1607 | def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res
1608 |
1609 | class WriteSection(Node):
1610 | def prep(self, shared): return shared["outline"]
1611 | def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}")
1612 | def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res
1613 |
1614 | class ReviewAndRefine(Node):
1615 | def prep(self, shared): return shared["draft"]
1616 | def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}")
1617 | def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res
1618 |
1619 | # Connect nodes
1620 | outline = GenerateOutline()
1621 | write = WriteSection()
1622 | review = ReviewAndRefine()
1623 |
1624 | outline >> write >> review
1625 |
1626 | # Create and run flow
1627 | writing_flow = Flow(start=outline)
1628 | shared = {"topic": "AI Safety"}
1629 | writing_flow.run(shared)
1630 | ```
1631 |
1632 | For *dynamic cases*, consider using [Agents](./agent.md).
1633 |
1634 | ================================================
1635 | File: docs/utility_function/llm.md
1636 | ================================================
1637 | ---
1638 | layout: default
1639 | title: "LLM Wrapper"
1640 | parent: "Utility Function"
1641 | nav_order: 1
1642 | ---
1643 |
1644 | # LLM Wrappers
1645 |
1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm).
1647 | Here, we provide some minimal example implementations:
1648 |
1649 | 1. OpenAI
1650 | ```python
1651 | def call_llm(prompt):
1652 | from openai import OpenAI
1653 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
1654 | r = client.chat.completions.create(
1655 | model="gpt-4o",
1656 | messages=[{"role": "user", "content": prompt}]
1657 | )
1658 | return r.choices[0].message.content
1659 |
1660 | # Example usage
1661 | call_llm("How are you?")
1662 | ```
1663 | > Store the API key in an environment variable like OPENAI_API_KEY for security.
1664 | {: .best-practice }
1665 |
1666 | 2. Claude (Anthropic)
1667 | ```python
1668 | def call_llm(prompt):
1669 | from anthropic import Anthropic
1670 | client = Anthropic(api_key="YOUR_API_KEY_HERE")
1671 | r = client.messages.create(
1672 | model="claude-sonnet-4-0",
1673 | messages=[
1674 | {"role": "user", "content": prompt}
1675 | ]
1676 | )
1677 | return r.content[0].text
1678 | ```
1679 |
1680 | 3. Google (Gemini API)
1681 | ```python
1682 | def call_llm(prompt):
1683 | from google import genai
1684 | client = genai.Client(api_key='GEMINI_API_KEY')
1685 | response = client.models.generate_content(
1686 | model='gemini-2.5-pro',
1687 | contents=prompt
1688 | )
1689 | return response.text
1690 | ```
1691 |
1692 | 4. Azure (Azure OpenAI)
1693 | ```python
1694 | def call_llm(prompt):
1695 | from openai import AzureOpenAI
1696 | client = AzureOpenAI(
1697 | azure_endpoint="https://.openai.azure.com/",
1698 | api_key="YOUR_API_KEY_HERE",
1699 | api_version="2023-05-15"
1700 | )
1701 | r = client.chat.completions.create(
1702 | model="",
1703 | messages=[{"role": "user", "content": prompt}]
1704 | )
1705 | return r.choices[0].message.content
1706 | ```
1707 |
1708 | 5. Ollama (Local LLM)
1709 | ```python
1710 | def call_llm(prompt):
1711 | from ollama import chat
1712 | response = chat(
1713 | model="llama2",
1714 | messages=[{"role": "user", "content": prompt}]
1715 | )
1716 | return response.message.content
1717 | ```
1718 |
1719 | ## Improvements
1720 | Feel free to enhance your `call_llm` function as needed. Here are examples:
1721 |
1722 | - Handle chat history:
1723 |
1724 | ```python
1725 | def call_llm(messages):
1726 | from openai import OpenAI
1727 | client = OpenAI(api_key="YOUR_API_KEY_HERE")
1728 | r = client.chat.completions.create(
1729 | model="gpt-4o",
1730 | messages=messages
1731 | )
1732 | return r.choices[0].message.content
1733 | ```
1734 |
1735 | - Add in-memory caching
1736 |
1737 | ```python
1738 | from functools import lru_cache
1739 |
1740 | @lru_cache(maxsize=1000)
1741 | def call_llm(prompt):
1742 | # Your implementation here
1743 | pass
1744 | ```
1745 |
1746 | > ⚠️ Caching conflicts with Node retries, because a retried call would simply return the same cached result instead of re-invoking the LLM.
1747 | >
1748 | > To address this, you could use cached results only if not retried.
1749 | {: .warning }
1750 |
1751 |
1752 | ```python
1753 | from functools import lru_cache
1754 |
1755 | @lru_cache(maxsize=1000)
1756 | def cached_call(prompt):
1757 | pass
1758 |
1759 | def call_llm(prompt, use_cache):
1760 | if use_cache:
1761 | return cached_call(prompt)
1762 | # Call the underlying function directly
1763 | return cached_call.__wrapped__(prompt)
1764 |
1765 | class SummarizeNode(Node):
1766 | def exec(self, text):
1767 | return call_llm(f"Summarize: {text}", self.cur_retry==0)
1768 | ```
1769 |
1770 | - Enable logging:
1771 |
1772 | ```python
1773 | def call_llm(prompt):
1774 | import logging
1775 | logging.info(f"Prompt: {prompt}")
1776 | response = ... # Your implementation here
1777 | logging.info(f"Response: {response}")
1778 | return response
1779 | ```
--------------------------------------------------------------------------------