├── utils
    ├── __init__.py
    └── call_llm.py
├── requirements.txt
├── assets
    └── banner.png
├── flow.py
├── .gitignore
├── main.py
├── README.md
├── docs
    └── design.md
├── data_profiling_report.md
├── test
    └── patients.csv
├── nodes.py
├── .clinerules
└── .cursorrules


/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pocketflow>=0.0.1
2 | pandas>=2.0.0
3 | PyYAML>=6.0
4 | openai>=1.0.0
5 | google-genai


--------------------------------------------------------------------------------
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-Pocket/PocketFlow-Tutorial-Data-Profiler/main/assets/banner.png


--------------------------------------------------------------------------------
/utils/call_llm.py:
--------------------------------------------------------------------------------
 1 | from google import genai
 2 | import os
 3 | 
 4 | def call_llm(prompt: str) -> str:
 5 |     """
 6 |     Call Google Gemini LLM with the given prompt.
 7 |     
 8 |     Args:
 9 |         prompt (str): The prompt to send to the LLM
10 |         
11 |     Returns:
12 |         str: The response from the LLM
13 |     """
14 |     api_key = os.getenv("GEMINI_API_KEY", "Your API Key")
15 |     client = genai.Client(api_key=api_key)
16 |     model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")
17 |     
18 |     response = client.models.generate_content(
19 |         model=model, 
20 |         contents=[prompt]
21 |     )
22 |     return response.text
23 | 
24 | if __name__ == "__main__":
25 |     test_prompt = "Hello, how are you?"
26 |     
27 |     print("Making call...")
28 |     response = call_llm(test_prompt)
29 |     print(f"Response: {response}")


--------------------------------------------------------------------------------
/flow.py:
--------------------------------------------------------------------------------
 1 | from pocketflow import Flow
 2 | from nodes import (
 3 |     DuplicateDetectionNode, 
 4 |     TableSummaryNode, 
 5 |     ColumnDescriptionNode,
 6 |     DataTypeAnalysisNode, 
 7 |     MissingValuesAnalysisNode, 
 8 |     UniquenessAnalysisNode,
 9 |     UnusualValuesDetectionNode, 
10 |     GenerateReportNode
11 | )
12 | 
13 | def create_data_profiling_flow():
14 |     """Create and return a data profiling flow."""
15 |     
16 |     # Create all nodes
17 |     duplicate_node = DuplicateDetectionNode()
18 |     summary_node = TableSummaryNode()
19 |     column_desc_node = ColumnDescriptionNode()
20 |     data_type_node = DataTypeAnalysisNode()
21 |     missing_values_node = MissingValuesAnalysisNode()
22 |     uniqueness_node = UniquenessAnalysisNode()
23 |     unusual_values_node = UnusualValuesDetectionNode()
24 |     report_node = GenerateReportNode()
25 |     
26 |     # Connect nodes in sequence (following the workflow design)
27 |     duplicate_node >> summary_node >> column_desc_node >> data_type_node >> missing_values_node >> uniqueness_node >> unusual_values_node >> report_node
28 |     
29 |     # Create flow starting with duplicate detection
30 |     return Flow(start=duplicate_node)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Dependencies
 2 | node_modules/
 3 | vendor/
 4 | .pnp/
 5 | .pnp.js
 6 | 
 7 | # Build outputs
 8 | dist/
 9 | build/
10 | out/
11 | *.pyc
12 | __pycache__/
13 | 
14 | # Environment files
15 | .env
16 | .env.local
17 | .env.*.local
18 | .env.development
19 | .env.test
20 | .env.production
21 | 
22 | # IDE - VSCode
23 | .vscode/*
24 | !.vscode/settings.json
25 | !.vscode/tasks.json
26 | !.vscode/launch.json
27 | !.vscode/extensions.json
28 | 
29 | # IDE - JetBrains
30 | .idea/
31 | *.iml
32 | *.iws
33 | *.ipr
34 | 
35 | # IDE - Eclipse
36 | .project
37 | .classpath
38 | .settings/
39 | 
40 | # Logs
41 | logs/
42 | *.log
43 | npm-debug.log*
44 | yarn-debug.log*
45 | yarn-error.log*
46 | 
47 | # Operating System
48 | .DS_Store
49 | Thumbs.db
50 | *.swp
51 | *.swo
52 | 
53 | # Testing
54 | coverage/
55 | .nyc_output/
56 | 
57 | # Temporary files
58 | *.tmp
59 | *.temp
60 | .cache/
61 | 
62 | # Compiled files
63 | *.com
64 | *.class
65 | *.dll
66 | *.exe
67 | *.o
68 | *.so
69 | 
70 | # Package files
71 | *.7z
72 | *.dmg
73 | *.gz
74 | *.iso
75 | *.jar
76 | *.rar
77 | *.tar
78 | *.zip
79 | 
80 | # Database
81 | *.sqlite
82 | *.sqlite3
83 | *.db
84 | 
85 | # Optional npm cache directory
86 | .npm
87 | 
88 | # Optional eslint cache
89 | .eslintcache
90 | 
91 | # Optional REPL history
92 | .node_repl_history 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from flow import create_data_profiling_flow
 3 | 
 4 | def main():
 5 |     """Main function for data profiling"""
 6 |     
 7 |     # Load the test dataset
 8 |     print("Loading patient data...")
 9 |     df = pd.read_csv("test/patients.csv")
10 |     print(f"Loaded {len(df)} rows and {len(df.columns)} columns")
11 |     
12 |     # Initialize shared store with the data profiling structure
13 |     shared = {
14 |         "dataframe": df,
15 |         "sample_data": "",
16 |         "profile_results": {
17 |             "duplicates": {},
18 |             "table_summary": "",
19 |             "column_descriptions": {},
20 |             "data_types": {},
21 |             "missing_values": {},
22 |             "uniqueness": {},
23 |             "unusual_values": {}
24 |         },
25 |         "final_report": ""
26 |     }
27 |     
28 |     # Create and run the data profiling flow
29 |     print("\nStarting data profiling analysis...")
30 |     profiling_flow = create_data_profiling_flow()
31 |     profiling_flow.run(shared)
32 |     
33 |     # Save the report first (avoid console encoding issues)
34 |     with open("data_profiling_report.md", "w", encoding="utf-8") as f:
35 |         f.write(shared["final_report"])
36 |     print("\nReport saved to: data_profiling_report.md")
37 |     print(f"Report contains {len(shared['final_report'])} characters")
38 |     
39 |     # Show basic stats instead of full report
40 |     print("\n" + "="*50 + " SUMMARY " + "="*50)
41 |     dup = shared["profile_results"]["duplicates"]
42 |     print(f"✓ Analyzed {dup['total_rows']} rows, {len(shared['dataframe'].columns)} columns")
43 |     print(f"✓ Found {dup['count']} duplicate rows ({dup['percentage']:.1f}%)")
44 |     print(f"✓ Analysis complete - check data_profiling_report.md for full details")
45 |     print("="*108)
46 | 
47 | if __name__ == "__main__":
48 |     main()
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PocketFlow Data Profiling Tool
  2 | 
  3 | An intelligent data profiling tool powered by LLMs that provides deep, contextual analysis of your datasets beyond traditional statistical metrics.
  4 | 
  5 | ## 🎯 What This Tool Does
  6 | 
  7 | This tool performs comprehensive data profiling through a 7-step workflow:
  8 | 
  9 | 1. **Duplicate Detection** - Identifies and analyzes duplicate rows with recommendations
 10 | 2. **Table Summary** - Generates high-level description of what your data represents
 11 | 3. **Column Descriptions** - Analyzes each column with meaningful descriptions and naming suggestions
 12 | 4. **Data Type Analysis** - Recommends optimal data types for each column
 13 | 5. **Missing Values Analysis** - Categorizes missing values as meaningful vs problematic
 14 | 6. **Uniqueness Analysis** - Identifies potential unique identifier columns
 15 | 7. **Unusual Values Detection** - Detects outliers, anomalies, and data quality issues
 16 | 
 17 | ## 🚀 How to Run
 18 | 
 19 | ### Prerequisites
 20 | 
 21 | 1. **Install dependencies:**
 22 | ```bash
 23 | pip install -r requirements.txt
 24 | ```
 25 | 
 26 | 2. **Set up your LLM:**
 27 | 
 28 | The tool uses OpenAI by default. Set your API key:
 29 | ```bash
 30 | export OPENAI_API_KEY="your-key-here"
 31 | ```
 32 | 
 33 | To use your own LLM or different providers, check out the [PocketFlow LLM documentation](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) and modify `utils/call_llm.py` accordingly.
 34 | 
 35 | **Test your LLM setup:**
 36 | ```bash
 37 | python utils/call_llm.py
 38 | ```
 39 | 
 40 | ### Running the Tool
 41 | 
 42 | ```bash
 43 | python main.py
 44 | ```
 45 | 
 46 | By default, it analyzes the sample patient dataset in `test/patients.csv`. To analyze your own data, modify `main.py`:
 47 | 
 48 | ```python
 49 | # Replace this line:
 50 | df = pd.read_csv("test/patients.csv")
 51 | 
 52 | # With your data:
 53 | df = pd.read_csv("path/to/your/data.csv")
 54 | ```
 55 | 
 56 | ### Output
 57 | 
 58 | The tool generates:
 59 | - **Console summary** with key statistics
 60 | - **Markdown report** saved as `data_profiling_report.md` with comprehensive analysis
 61 | 
 62 | ## 📊 Example Results
 63 | 
 64 | From the sample patient dataset (60 rows, 27 columns):
 65 | 
 66 | - ✅ Detected invalid SSN formats (test data with "999" prefix)
 67 | - ✅ Identified name contamination (numeric suffixes in names)
 68 | - ✅ Found meaningful missing patterns (83% missing death dates = living patients)
 69 | - ✅ Recommended data type conversions (dates to datetime64, categories for demographics)
 70 | - ✅ Identified unique identifiers (UUID primary key, SSN)
 71 | 
 72 | ## 🏗️ Architecture
 73 | 
 74 | Built with [PocketFlow](https://github.com/The-Pocket/PocketFlow) - a minimalist LLM framework:
 75 | 
 76 | - **Workflow pattern** for sequential processing pipeline
 77 | - **BatchNode** for efficient parallel column analysis
 78 | - **YAML-based** structured outputs with validation
 79 | - **Intelligent LLM analysis** for contextual understanding
 80 | 
 81 | ## 📁 Project Structure
 82 | 
 83 | ```
 84 | ├── main.py                 # Entry point
 85 | ├── flow.py                 # Flow orchestrator
 86 | ├── nodes.py                # All profiling nodes
 87 | ├── utils/
 88 | │   └── call_llm.py        # LLM utility (customize for your provider)
 89 | ├── test/
 90 | │   └── patients.csv       # Sample dataset
 91 | └── docs/
 92 |     └── design.md          # Design documentation
 93 | ```
 94 | 
 95 | ## 🔧 Customization
 96 | 
 97 | ### Using Different LLM Providers
 98 | 
 99 | Edit `utils/call_llm.py` to use your preferred LLM:
100 | - Claude (Anthropic)
101 | - Google Gemini
102 | - Azure OpenAI
103 | - Local models (Ollama)
104 | 
105 | See the [PocketFlow LLM guide](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) for examples.
106 | 
107 | ### Analyzing Different Data Types
108 | 
109 | The tool works with any pandas DataFrame. You can:
110 | - Load from CSV, Excel, JSON, Parquet
111 | - Connect to databases
112 | - Use API data
113 | 
114 | Just ensure your data is loaded as a pandas DataFrame before running the flow.
115 | 
116 | ## 🎓 Tutorial
117 | 
118 | This project demonstrates **Agentic Coding** with [PocketFlow](https://github.com/The-Pocket/PocketFlow). Want to learn more?
119 | 
120 | - Check out the [Agentic Coding Guidance](https://the-pocket.github.io/PocketFlow/guide.html)
121 | - Watch the [YouTube Tutorial](https://www.youtube.com/@ZacharyLLM?sub_confirmation=1)
122 | 
123 | ## 📝 License
124 | 
125 | This project is a tutorial example for PocketFlow.
126 | 


--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
  1 | # Design Doc: Data Profiling Tool
  2 | 
  3 | > Please DON'T remove notes for AI
  4 | 
  5 | ## Requirements
  6 | 
  7 | > Notes for AI: Keep it simple and clear.
  8 | > If the requirements are abstract, write concrete user stories
  9 | 
 10 | **Problem**: Users need to understand their pandas DataFrame data quality and characteristics before analysis or modeling.
 11 | 
 12 | **User Stories**:
 13 | - As a data scientist, I want to automatically detect duplicate rows so I can decide whether to remove them
 14 | - As an analyst, I want a high-level summary of my table to understand what the data represents
 15 | - As a data engineer, I want detailed column descriptions to understand each field's meaning
 16 | - As a developer, I want to identify correct data types for proper processing
 17 | - As a researcher, I want to find missing values and understand if they're meaningful or problematic
 18 | - As a quality analyst, I want to identify unique columns that could serve as identifiers
 19 | - As a data validator, I want to detect unusual/outlier values that may indicate data quality issues
 20 | 
 21 | ## Flow Design
 22 | 
 23 | > Notes for AI:
 24 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
 25 | > 2. Present a concise, high-level description of the workflow.
 26 | 
 27 | ### Applicable Design Pattern:
 28 | 
 29 | 1. **Workflow**: Sequential processing pipeline where each step builds upon previous analysis
 30 | 2. **Batch**: Some nodes (like column analysis) process multiple columns in parallel for efficiency
 31 | 
 32 | ### Flow High-level Design:
 33 | 
 34 | 1. **Duplicate Detection Node**: Analyzes the DataFrame for duplicate rows and provides statistics
 35 | 2. **Table Summary Node**: Creates a high-level description of what the table represents
 36 | 3. **Column Description Node**: Analyzes each column to provide meaningful descriptions and suggest better names
 37 | 4. **Data Type Analysis Node**: Determines appropriate data types for each column
 38 | 5. **Missing Values Analysis Node**: Identifies missing values and categorizes them as meaningful vs problematic
 39 | 6. **Uniqueness Analysis Node**: Identifies columns that could serve as unique identifiers
 40 | 7. **Unusual Values Detection Node**: Detects outliers and anomalous values in each column
 41 | 
 42 | ```mermaid
 43 | flowchart TD
 44 |     start[Start: Load DataFrame] --> duplicate[Duplicate Detection]
 45 |     duplicate --> summary[Table Summary]
 46 |     summary --> columns[Column Descriptions]
 47 |     columns --> datatypes[Data Type Analysis]
 48 |     datatypes --> missing[Missing Values Analysis]
 49 |     missing --> unique[Uniqueness Analysis]
 50 |     unique --> unusual[Unusual Values Detection]
 51 |     unusual --> report[Generate Final Report]
 52 | ```
 53 | 
 54 | ## Utility Functions
 55 | 
 56 | > Notes for AI:
 57 | > 1. Understand the utility function definition thoroughly by reviewing the doc.
 58 | > 2. Include only the necessary utility functions, based on nodes in the flow.
 59 | 
 60 | 1. **Call LLM** (`utils/call_llm.py`)
 61 |    - *Input*: prompt (str)
 62 |    - *Output*: response (str)
 63 |    - Used by all analysis nodes for intelligent data interpretation
 64 | 
 65 | ## Node Design
 66 | 
 67 | ### Shared Store
 68 | 
 69 | > Notes for AI: Try to minimize data redundancy
 70 | 
 71 | The shared store structure is organized as follows:
 72 | 
 73 | ```python
 74 | shared = {
 75 |     "dataframe": pd.DataFrame,          # Original DataFrame
 76 |     "sample_data": str,                 # CSV sample for LLM analysis
 77 |     "profile_results": {
 78 |         "duplicates": {
 79 |             "count": int,
 80 |             "percentage": float,
 81 |             "sample_rows": str
 82 |         },
 83 |         "table_summary": str,
 84 |         "column_descriptions": {
 85 |             "col_name": {
 86 |                 "description": str,
 87 |                 "suggested_name": str
 88 |             }
 89 |         },
 90 |         "data_types": {
 91 |             "col_name": {
 92 |                 "current_type": str,
 93 |                 "suggested_type": str,
 94 |                 "confidence": float
 95 |             }
 96 |         },
 97 |         "missing_values": {
 98 |             "col_name": {
 99 |                 "count": int,
100 |                 "percentage": float,
101 |                 "likely_meaningful": bool,
102 |                 "reason": str
103 |             }
104 |         },
105 |         "uniqueness": {
106 |             "col_name": {
107 |                 "unique_count": int,
108 |                 "unique_percentage": float,
109 |                 "is_candidate_key": bool
110 |             }
111 |         },
112 |         "unusual_values": {
113 |             "col_name": {
114 |                 "has_unusual": bool,
115 |                 "unusual_samples": list,
116 |                 "explanation": str
117 |             }
118 |         }
119 |     },
120 |     "final_report": str                 # Comprehensive profiling report
121 | }
122 | ```
123 | 
124 | ### Node Steps
125 | 
126 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
127 | 
128 | 1. **Duplicate Detection Node**
129 |    - *Purpose*: Detect and analyze duplicate rows in the DataFrame
130 |    - *Type*: Regular Node
131 |    - *Steps*:
132 |      - *prep*: Read "dataframe" from shared store and create sample
133 |      - *exec*: Call LLM to analyze duplicate patterns and significance
134 |      - *post*: Write duplicate analysis to "profile_results.duplicates"
135 | 
136 | 2. **Table Summary Node**
137 |    - *Purpose*: Generate high-level description of the table's purpose and content
138 |    - *Type*: Regular Node
139 |    - *Steps*:
140 |      - *prep*: Read "dataframe" sample and column names from shared store
141 |      - *exec*: Call LLM to generate comprehensive table summary
142 |      - *post*: Write summary to "profile_results.table_summary"
143 | 
144 | 3. **Column Description Node**
145 |    - *Purpose*: Analyze each column to provide descriptions and name suggestions
146 |    - *Type*: Batch Node (processes columns in chunks)
147 |    - *Steps*:
148 |      - *prep*: Return list of column chunks for parallel processing
149 |      - *exec*: Call LLM to analyze each column chunk for descriptions
150 |      - *post*: Combine results and write to "profile_results.column_descriptions"
151 | 
152 | 4. **Data Type Analysis Node**
153 |    - *Purpose*: Determine appropriate data types for each column
154 |    - *Type*: Regular Node
155 |    - *Steps*:
156 |      - *prep*: Read "dataframe" and column info from shared store
157 |      - *exec*: Call LLM to analyze data types with sample data
158 |      - *post*: Write type analysis to "profile_results.data_types"
159 | 
160 | 5. **Missing Values Analysis Node**
161 |    - *Purpose*: Analyze missing values to determine if they're meaningful or problematic
162 |    - *Type*: Regular Node
163 |    - *Steps*:
164 |      - *prep*: Read "dataframe" and calculate missing value statistics
165 |      - *exec*: Call LLM to determine if missing values are meaningful
166 |      - *post*: Write missing value analysis to "profile_results.missing_values"
167 | 
168 | 6. **Uniqueness Analysis Node**
169 |    - *Purpose*: Identify columns that could serve as unique identifiers
170 |    - *Type*: Regular Node
171 |    - *Steps*:
172 |      - *prep*: Read "dataframe" and calculate uniqueness statistics
173 |      - *exec*: Call LLM to determine candidate key columns
174 |      - *post*: Write uniqueness analysis to "profile_results.uniqueness"
175 | 
176 | 7. **Unusual Values Detection Node**
177 |    - *Purpose*: Detect outliers and anomalous values in columns
178 |    - *Type*: Batch Node (processes columns individually)
179 |    - *Steps*:
180 |      - *prep*: Return list of columns to analyze for unusual values
181 |      - *exec*: Call LLM to analyze each column's value patterns
182 |      - *post*: Write unusual value findings to "profile_results.unusual_values"
183 | 
184 | 


--------------------------------------------------------------------------------
/data_profiling_report.md:
--------------------------------------------------------------------------------
  1 | # Data Profiling Report
  2 | 
  3 | ## Table Summary
  4 | This table represents a collection of detailed personal records for individuals. Each person is identified by an **Id**, and may also have an **SSN**, **DRIVERS** license, or **PASSPORT** number. Their full name is detailed with **PREFIX**, **FIRST**, **LAST**, **SUFFIX**, and a **MAIDEN** name if applicable.
  5 | 
  6 | The records include vital and demographic information such as **BIRTHDATE**, **DEATHDATE**, **MARITAL** status, **RACE**, **ETHNICITY**, and **GENDER**. Geographic information specifies the person's **BIRTHPLACE** and their current residential **ADDRESS**, **CITY**, **STATE**, **COUNTY**, **FIPS** code, **ZIP**, and geographic coordinates (**LAT**, **LON**). Finally, the table contains financial information related to an individual's **HEALTHCARE_EXPENSES**, **HEALTHCARE_COVERAGE**, and **INCOME**.
  7 | 
  8 | ## Duplicate Analysis
  9 | - **Total rows**: 60
 10 | - **Duplicate rows**: 0 (0.00%)
 11 | - **Should remove**: False
 12 | - **Analysis**: No duplicate rows found in the dataset.
 13 | 
 14 | ## Column Descriptions
 15 | - **Id** → *person_id*: A unique identifier for each record, formatted as a UUID (Universally Unique Identifier).
 16 | - **BIRTHDATE** → *birth_date*: The person's date of birth in YYYY-MM-DD format.
 17 | - **DEATHDATE** → *death_date*: The person's date of death in YYYY-MM-DD format. This field is empty if the person is alive.
 18 | - **SSN** → *social_security_number*: The person's 9-digit Social Security Number, formatted as XXX-XX-XXXX.
 19 | - **DRIVERS** → *drivers_license_number*: The person's driver's license number.
 20 | - **PASSPORT** → *passport_number*: The person's passport number.
 21 | - **PREFIX** → *name_prefix*: A title or honorific that precedes a person's name (e.g., 'Mr.', 'Mrs.', 'Dr.').
 22 | - **FIRST** → *first_name*: The person's first or given name.
 23 | - **LAST** → *last_name*: The person's last or family name.
 24 | - **SUFFIX** → *name_suffix*: A suffix that follows a person's full name (e.g., 'Jr.', 'Sr.', 'III').
 25 | - **MAIDEN** → *maiden_name*: The individual's last name at birth, often used for married individuals who have changed their name. Appears to have null values for those it does not apply to.
 26 | - **MARITAL** → *marital_status*: The individual's marital status. The sample data uses 'M' likely for 'Married'.
 27 | - **RACE** → *race*: The individual's self-identified race.
 28 | - **ETHNICITY** → *ethnicity*: The individual's self-identified ethnicity, primarily indicating Hispanic or Non-Hispanic origin.
 29 | - **GENDER** → *gender*: The individual's gender, represented by 'M' for Male and 'F' for Female.
 30 | - **BIRTHPLACE** → *birth_place*: The location where the individual was born, as a single string containing city, state, and country.
 31 | - **ADDRESS** → *street_address*: The street address of the individual's residence, including building number, street name, and unit/apartment number.
 32 | - **CITY** → *city*: The city of the individual's residential address.
 33 | - **STATE** → *state*: The state of the individual's residential address.
 34 | - **COUNTY** → *county*: The county of the individual's residential address.
 35 | - **FIPS** → *fips_code*: A FIPS (Federal Information Processing Standard) code, likely identifying a US county.
 36 | - **ZIP** → *zip_code*: The 5-digit US postal ZIP code for the location.
 37 | - **LAT** → *latitude*: The geographic latitude coordinate for the location.
 38 | - **LON** → *longitude*: The geographic longitude coordinate for the location.
 39 | - **HEALTHCARE_EXPENSES** → *healthcare_expenses_usd*: A monetary value representing healthcare-related expenses, likely per capita or household, in USD.
 40 | - **HEALTHCARE_COVERAGE** → *healthcare_coverage_value_usd*: A monetary value related to healthcare coverage, possibly representing total premiums or insured value in the area.
 41 | - **INCOME** → *median_income_usd*: A monetary value representing the average or median income for the area, likely in USD.
 42 | 
 43 | ## Data Type Analysis
 44 | - **BIRTHDATE**: object → *datetime64* (The column contains date values in a standard 'YYYY-MM-DD' format.)
 45 | - **DEATHDATE**: object → *datetime64* (The column contains date values and empty strings, which can be represented as dates and Not a Time (NaT) values.)
 46 | - **PREFIX**: object → *category* (The column has a small number of repeated string values (e.g., 'Mr.', 'Mrs.', 'Ms.'), making it ideal for the memory-efficient category type.)
 47 | - **SUFFIX**: object → *category* (This column likely contains a small, fixed set of name suffixes (e.g. 'Jr.', 'Sr.'), making it suitable for the category type.)
 48 | - **MARITAL**: object → *category* (The column represents marital status and likely has a small number of distinct values ('M', 'S', etc.), making it ideal for the category type.)
 49 | - **RACE**: object → *category* (The column contains a small, well-defined set of values for race, which is a classic categorical variable.)
 50 | - **ETHNICITY**: object → *category* (The column contains a small, well-defined set of values for ethnicity, making it a categorical variable.)
 51 | - **GENDER**: object → *category* (The column has a very small number of distinct values ('M', 'F'), making it a prime candidate for the category type.)
 52 | - **CITY**: object → *category* (The number of unique city names is much smaller than the total number of records, making 'category' a memory-efficient choice.)
 53 | - **STATE**: object → *category* (The number of unique states is very small and fixed, making this an ideal categorical variable.)
 54 | - **COUNTY**: object → *category* (The number of unique counties is finite and much smaller than the number of records, making 'category' a memory-efficient choice.)
 55 | - **FIPS**: float64 → *category* (FIPS codes are categorical identifiers for geographic locations. Using 'category' is memory efficient and semantically correct as they are not used for mathematical operations.)
 56 | - **ZIP**: int64 → *category* (ZIP codes are geographic identifiers. While numeric, they are not used for calculations. Using 'category' is memory-efficient and avoids issues with leading zeros.)
 57 | 
 58 | ## Missing Values Analysis
 59 | **Overview**: The dataset exhibits both meaningful and problematic missingness. Fields like DEATHDATE, SUFFIX, and MAIDEN have high percentages of missing values that are expected and informative, indicating a specific status (e.g., 'alive' or 'not applicable'). Conversely, fields like MARITAL and FIPS have missing values that represent genuine data quality gaps, hindering demographic and geographic analysis.
 60 | 
 61 | ### Problematic Missing Values
 62 | - **PREFIX**: 10 missing (16.7%) - Prefixes (Mr., Ms., etc.) are often optional fields. While their absence is common, it represents incomplete data rather than a specific status, making it a minor data quality issue.
 63 | - **MARITAL**: 20 missing (33.3%) - Marital status is a core demographic attribute. A 33.3% missing rate is a significant data quality problem, as the absence does not imply a default status (like 'single') and creates gaps in analysis.
 64 | - **FIPS**: 14 missing (23.3%) - FIPS is a standardized geographic code for a county. Since COUNTY data exists, the FIPS code should be derivable. Its absence is a data processing or quality issue that hinders standardized geographic analysis.
 65 | 
 66 | ### Likely Meaningful Missing Values
 67 | - **DEATHDATE**: 50 missing (83.3%) - The high percentage of missing values (83.3%) strongly suggests that a blank DEATHDATE indicates the person is still alive. The absence of data is the data.
 68 | - **DRIVERS**: 6 missing (10.0%) - A missing driver's license number likely means the person does not have one, which could be due to age (minors) or personal choice. It is not necessarily an error.
 69 | - **PASSPORT**: 13 missing (21.7%) - Similar to a driver's license, not every individual has a passport. A missing value indicates the person likely does not possess one.
 70 | - **SUFFIX**: 56 missing (93.3%) - Name suffixes (Jr., III, etc.) are rare. The very high percentage of missing values (93.3%) correctly reflects that most people do not have one.
 71 | - **MAIDEN**: 49 missing (81.7%) - A maiden name is only applicable to a subset of the population (typically, married individuals who changed their name). A blank value is expected for males, unmarried individuals, or those who kept their original name.
 72 | 
 73 | ## Uniqueness Analysis
 74 | ### Candidate Key Columns
 75 | - **Id**: This column is a system-generated unique identifier (like a UUID) for each record. The table context states it identifies each person, and the data analysis confirms it is 100% unique. It's designed specifically to be a primary key.
 76 | - **SSN**: A Social Security Number is a government-issued number intended to be a unique identifier for each person in the United States. It is 100% unique in the sample data and is a strong candidate for a natural key, despite its sensitive nature.
 77 | 
 78 | ### Highly Unique Columns
 79 | - **BIRTHDATE**: 83.3% unique
 80 | - **DRIVERS**: 90.0% unique
 81 | - **PASSPORT**: 78.3% unique
 82 | - **FIRST**: 98.3% unique
 83 | - **LAST**: 88.3% unique
 84 | - **BIRTHPLACE**: 80.0% unique
 85 | - **ADDRESS**: 100.0% unique
 86 | - **CITY**: 66.7% unique
 87 | - **ZIP**: 60.0% unique
 88 | - **LAT**: 100.0% unique
 89 | - **LON**: 100.0% unique
 90 | - **HEALTHCARE_EXPENSES**: 100.0% unique
 91 | - **HEALTHCARE_COVERAGE**: 98.3% unique
 92 | - **INCOME**: 83.3% unique
 93 | 
 94 | ## Unusual Values Detection
 95 | - **SSN**: All sample values begin with the area number '999'. The Social Security Administration (SSA) does not issue SSNs with area numbers (the first three digits) in the 900-999 range. These values are invalid and likely represent dummy or placeholder data.
 96 | - **FIRST**: The column 'FIRST' is expected to contain first names. However, all sample values are a mix of text and numbers (e.g., 'Mel236', 'Cheyenne169'). This suggests that names have been concatenated with a numeric ID or code, which is unusual for a standard first name field.
 97 | - **LAST**: The values in the 'LAST' column consistently follow a pattern of a name followed by a three-digit number (e.g., 'Bailey598'). This is unusual because a column named 'LAST' is expected to contain only the last name. The presence of appended numbers suggests a potential data quality issue where a name and a numeric ID have been merged into a single field.
 98 | - **MAIDEN**: The values in the 'MAIDEN' column consistently follow a pattern of a name followed by a sequence of numbers (e.g., 'Lowe577'). A column representing a maiden name would typically contain only alphabetic characters. The presence of appended numbers is unusual and suggests the column may be a concatenation of a name and a numeric identifier.
 99 | - **FIPS**: The values appear to be valid 5-digit county FIPS codes. However, they are stored as floats (float64) instead of strings. FIPS codes are identifiers, not numerical quantities, and should be stored as strings to prevent issues like the loss of leading zeros (e.g., '01001' becoming 1001.0) and to reflect their categorical nature. The trailing '.0' in each sample is an artifact of this incorrect data type.
100 | - **ZIP**: The value '0' is present, which is not a valid ZIP or postal code. This value likely represents missing data, a default entry, or an error during data conversion.
101 | - **HEALTHCARE_COVERAGE**: The presence of `0.0` is unusual. It's ambiguous whether this represents a valid state (no coverage) or is a placeholder for missing data. Additionally, the data has a very wide range, and the value `1777031.06` is a potential high-end outlier, being significantly larger than the other sample values.
102 | 


--------------------------------------------------------------------------------
/test/patients.csv:
--------------------------------------------------------------------------------
 1 | Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
 2 | eb247227-e839-88d3-447d-b5972468f33b,2021-09-23,,999-41-1756,,,,Mel236,Bailey598,,,,white,nonhispanic,M,Norton Center  Massachusetts  US,716 Wunsch Gardens Unit 48,Framingham,Massachusetts,Middlesex County,25017,01701,42.27565048847629,-71.4763670033942,2520.80,4323.64,170754
 3 | 2ffa361e-5858-877e-e022-ce81fe32da1b,1944-05-31,,999-33-4589,S99957814,X45639058X,Mrs.,Cheyenne169,Marks830,,Lowe577,M,white,nonhispanic,F,Longmeadow  Massachusetts  US,123 Bayer Camp,Taunton,Massachusetts,Bristol County,25005,02718,41.89288420730215,-71.06668598167076,205342.20,94647.00,40526
 4 | 3dfb065a-67df-5b8a-3901-49bfd834bed1,2009-02-08,,999-59-2568,,,,Hunter736,Keebler762,,,,white,nonhispanic,M,Maynard  Massachusetts  US,575 Jast Rue Unit 48,Winchendon,Massachusetts,Worcester County,25027,01475,42.670059014687666,-72.07466425723803,16381.92,17447.87,79884
 5 | db80575b-5e9b-921b-fad9-1e3a20929dc7,1979-06-26,1995-07-04,999-77-7700,S99968506,,,Herschel574,Ernser583,,,,asian,nonhispanic,M,Somerville  Massachusetts  US,184 Langworth Parade Apt 10,Boston,Massachusetts,Suffolk County,25025,02131,42.39551626795498,-71.05901494925675,3850.00,44057.32,6420
 6 | d84815a3-c5b3-8ca2-025f-6323a4ec59ef,1973-05-31,,999-29-2359,S99967405,X86891718X,Mrs.,Lacey714,Heathcote539,,Hegmann834,M,white,nonhispanic,F,Natick  Massachusetts  US,801 Morissette Divide,Hingham,Massachusetts,Plymouth County,25023,02043,42.20072325055452,-70.83659045847199,66662.10,1777031.06,933420
 7 | 7ec76836-c039-d9bf-8bb9-fe488c66d452,2003-01-13,,999-42-9847,S99998925,,Ms.,Adelia946,Collier206,,,,white,nonhispanic,F,Marshfield  Massachusetts  US,459 Larson Union,Boston,Massachusetts,Suffolk County,25025,02134,42.31550631209828,-71.05169551644717,4050.00,158604.59,1361
 8 | 79297a39-2d2d-d88d-5e47-7a521af1d69f,1998-12-14,,999-49-9846,S99945605,X69843358X,Mr.,Hayden835,Casper496,,,,white,nonhispanic,M,Charlton  Massachusetts  US,589 Conroy Approach,Belmont,Massachusetts,Middlesex County,25017,02472,42.36153882121102,-71.20913616208074,40347.11,180076.32,51861
 9 | 734e5f3c-e660-6cbe-7c26-c5264cbde68e,2005-03-03,,999-71-8314,S99991875,,,Herb645,Willms744,,,,white,hispanic,M,Melrose  Massachusetts  US,980 Koss Plaza Apt 11,Brockton,Massachusetts,Plymouth County,25023,02302,42.11165412918098,-71.0259065985567,390568.25,0.00,35002
10 | 750cdaf4-c264-e967-e76b-53a5a61abcab,1983-02-18,,999-95-3792,S99957390,X63804957X,Mr.,Stewart672,Schimmel440,,,M,white,nonhispanic,M,Fall River  Massachusetts  US,843 Yost Spur Unit 81,Sharon,Massachusetts,Norfolk County,25021,02067,42.10147261542774,-71.2054748347118,7321.10,176776.70,10335
11 | 285cba54-c91d-6db4-4d78-1ea35ba6b622,1998-10-30,,999-44-2795,S99942670,X49037240X,Ms.,Jenae263,Becker968,,,,white,nonhispanic,F,Somerville  Massachusetts  US,248 Ernser Terrace Suite 86,Lynn,Massachusetts,Essex County,25009,01901,42.49850442782566,-71.03582388708702,34690.56,626729.36,56421
12 | 064ef124-22ef-af09-1940-0fec6c3574bc,1972-05-01,,999-29-7349,S99945886,X86223344X,Ms.,Andera917,Lemke654,,,S,white,nonhispanic,F,Kingston  Massachusetts  US,606 Price View Unit 89,Boston,Massachusetts,Suffolk County,25025,02116,42.428281838517776,-71.03071500996144,591573.83,346350.22,28232
13 | df6bcea7-a0c7-6ed0-e9e4-fd1dc33b76f7,1965-08-03,,999-38-7473,S99953338,X64982272X,Mrs.,Linn541,Gislason620,,Hermann103,M,white,nonhispanic,F,Westwood  Massachusetts  US,275 Tromp Burg Suite 54,Erving,Massachusetts,Franklin County,,00000,42.6366197794778,-72.38328626727915,54801.69,228252.18,83088
14 | cfa94700-7440-d5f7-516a-bae08cb365a7,2022-08-14,,999-20-5403,,,,Kaycee352,Koss676,,,,white,nonhispanic,F,New Bedford  Massachusetts  US,859 Hansen Mission Apt 56,Montague,Massachusetts,Franklin County,,00000,42.590026050563246,-72.52614825581455,1644.00,1310.64,147152
15 | 52f8df2b-25a8-fbba-af75-0e11f3a054d4,2000-10-21,,999-16-4297,S99952319,X27281996X,Ms.,Cindy893,Lueilwitz711,,,,asian,nonhispanic,F,Hanoi  Hà Đông  VN,892 Haag Gateway Unit 67,Boston,Massachusetts,Suffolk County,25025,02120,42.31347958243134,-71.1029639299087,8065.00,917055.49,3758
16 | bc1efffb-0983-081f-d4c4-3345f6f2abbd,2009-05-16,,999-43-4282,,,,Huey641,Schumm995,,,,black,nonhispanic,M,Needham  Massachusetts  US,835 Powlowski Junction Suite 1,Danvers,Massachusetts,Essex County,25009,01923,42.60647530229309,-71.02623584083403,13076.83,5093.56,74906
17 | b3b71304-fe5b-bda4-6822-bd901b2836d1,1962-05-14,,999-48-5926,S99948203,X32413718X,Mr.,Antony83,Armstrong51,,,M,white,nonhispanic,M,Fall River  Massachusetts  US,830 Dare Park Apt 34,Marshfield,Massachusetts,Plymouth County,25023,02050,42.162629807392534,-70.73981579334742,41148.04,632431.62,35673
18 | fd0b726d-b7e6-976d-7cda-8679dd849610,1965-01-01,,999-96-6743,S99949277,X14857165X,Ms.,Daniela614,Rico947,,,S,white,hispanic,F,Bayamon  Puerto Rico  PR,279 Grady Estate,Boston,Massachusetts,Suffolk County,25025,02120,42.32851940354027,-71.03026624879821,54097.09,478630.93,93689
19 | 53534989-404e-cc7c-2859-1708edba296c,1959-05-28,,999-14-9672,S99998623,X86168167X,Mr.,Fletcher87,O'Conner199,,,S,white,nonhispanic,M,Wellesley  Massachusetts  US,873 Ledner Hollow Unit 28,Gardner,Massachusetts,Worcester County,25027,01440,42.56780989156895,-72.00057697137188,329359.12,160957.56,40192
20 | fca2a21e-3319-131a-7e84-ff984b871e16,1979-06-26,,999-29-4844,S99938158,X13579933X,Mr.,Kirk871,Nolan344,,,S,asian,nonhispanic,M,Billerica  Massachusetts  US,356 Wintheiser Passage,Boston,Massachusetts,Suffolk County,25025,02109,42.382825730736656,-71.060338397059,12300.34,775237.82,6420
21 | d89557ad-d741-8ea5-b542-c1226a781d83,1963-12-09,,999-22-2635,S99975865,X35559645X,Mr.,Steve819,Brakus656,,,M,white,nonhispanic,M,Quincy  Massachusetts  US,949 Langworth Light Apt 7,Yarmouth,Massachusetts,Barnstable County,,00000,41.67994080784203,-70.22659782831724,517636.61,2883.47,51527
22 | c8fbb10b-b54e-8182-d71c-c552bd1c58b1,1976-04-03,,999-50-5697,S99983337,X69114774X,Mr.,Hayden835,Schumm995,,,S,white,nonhispanic,M,Braintree  Massachusetts  US,856 Gusikowski Lane,North Adams,Massachusetts,Berkshire County,25003,01247,42.63685002791058,-73.08831197031574,190214.57,27591.19,36603
23 | 1d13ebc3-0635-059a-5fe9-82c92ede84ec,2006-05-08,,999-79-3695,S99923330,,,Moises22,O'Conner199,,,,white,hispanic,M,Saugus  Massachusetts  US,459 Cassin Forge Suite 9,Baldwinville,Massachusetts,Worcester County,25027,01436,42.603196202455976,-72.08703841216905,20035.70,11871.97,69174
24 | 0cd0df97-9d92-5d95-fbde-6b0a7e6af1c8,1932-07-09,1941-07-07,999-55-6098,,,,Coleman27,Kreiger457,,,,white,nonhispanic,M,Boston  Massachusetts  US,785 Ankunding Drive,Scituate,Massachusetts,Plymouth County,25023,02066,42.18599323192734,-70.79923081726402,9206.41,38661.38,870606
25 | fa451eba-6815-0d99-fd91-02f5581d914b,1946-11-12,,999-79-2426,S99940159,X53572828X,Ms.,Lillia547,Nolan344,,,S,white,nonhispanic,F,Chelsea  Massachusetts  US,287 Medhurst Bypass,Saugus,Massachusetts,Essex County,25009,01906,42.42536923970955,-71.00203089714843,64688.84,1193878.90,99091
26 | 92bc26b1-c317-7db6-492e-3c8ea452b36d,2005-04-22,,999-57-7157,S99990370,,,Vanesa40,Anderson154,,,,white,nonhispanic,F,Lawrence  Massachusetts  US,990 Hyatt Gateway,Chicopee,Massachusetts,Hampden County,25013,01013,42.20443967658899,-72.59684936418238,23076.68,14653.34,27706
27 | 8cfec0eb-f022-f332-55f5-38a2c35f5b84,2002-08-14,,999-93-7263,S99928764,X60540972X,Ms.,Mirta419,Hayes766,,,,white,nonhispanic,F,Boston  Massachusetts  US,737 Hauck Estate,Holliston,Massachusetts,Middlesex County,,00000,42.19537880557404,-71.40405413382206,11458.37,763952.25,17389
28 | 27948426-0f88-0e3b-dd6b-8bd9d8512892,1991-04-23,,999-99-2106,S99915210,X21909602X,Ms.,Shiela18,Jenkins714,,,S,white,nonhispanic,F,Methuen  Massachusetts  US,931 Lowe Route,Boston,Massachusetts,Suffolk County,25025,02111,42.366094173491376,-71.0766434184029,37489.68,507808.67,172784
29 | 548300bb-3152-531c-d895-d44fbf2ff1ba,1972-10-14,,999-17-2611,S99989489,X58425716X,Mrs.,Christal240,Hoppe518,,Gorczany269,M,white,nonhispanic,F,Boston  Massachusetts  US,298 Ryan Corner Suite 66,Sandwich,Massachusetts,Barnstable County,25001,02563,41.710640404859575,-70.46063787198887,53359.37,719097.64,91140
30 | 821960e1-9db8-7b56-a359-ac34d9228fbc,1960-04-26,,999-61-6611,S99976250,X16809119X,Mrs.,Guadalupe206,Bermúdez789,,Barela183,M,white,hispanic,F,Ponce  Puerto Rico  PR,336 Nienow Course,Tyngsborough,Massachusetts,Middlesex County,,00000,42.66681617186233,-71.47460669911267,67147.48,794967.24,64210
31 | 100881d9-7b59-8060-2772-f41b276970fc,1985-01-06,,999-89-9127,S99998550,X39158329X,Mrs.,Corey514,Johnson679,,Beier427,M,white,nonhispanic,F,Norton  Massachusetts  US,412 Spinka Plaza,Quincy,Massachusetts,Norfolk County,25021,02171,42.2627318928807,-71.01242779795183,41414.26,543793.88,129073
32 | d4db8ae1-3354-d064-508d-834cfa214cb2,1958-07-12,,999-35-8448,S99991458,X15138167X,Mr.,Gayle448,MacGyver246,,,M,white,nonhispanic,M,Framingham  Massachusetts  US,138 Hilll Well,Yarmouth,Massachusetts,Barnstable County,,00000,41.679741071089055,-70.17797861520093,79882.99,32342.26,21813
33 | d7f0610d-ec4d-fb9e-2d0d-b54e3ad621fb,1978-12-22,,999-88-1792,S99963549,X35369763X,Mr.,Whitney250,Hamill307,,,M,white,hispanic,M,Walpole  Massachusetts  US,416 Oberbrunner Dam Apt 95,Worcester,Massachusetts,Worcester County,25027,01603,42.29434089325724,-71.80056312559488,36814.41,239.57,44580
34 | e8323f7c-6829-3bce-0621-63d588b2e901,2010-09-21,,999-32-4862,,,,Dione665,Wilkinson796,,,,white,nonhispanic,F,Grafton  Massachusetts  US,455 Rutherford Lock,Lawrence,Massachusetts,Essex County,25009,01841,42.65392733597755,-71.1397130605323,13474.46,27265.47,64612
35 | 2995bf9b-5760-2099-77fe-ba01250cec42,1953-05-07,,999-10-1178,S99915523,X21910865X,Ms.,Shayla126,Rath779,,,S,white,nonhispanic,F,Fitchburg  Massachusetts  US,106 Hane Skyway Suite 0,Hampden,Massachusetts,Hampden County,,00000,42.07986301784488,-72.43670689724918,16242.76,800689.76,15098
36 | b4ea2bfe-cd6b-92a6-ff78-d2e995243894,1932-07-09,1966-11-11,999-70-2405,S99928700,X87718021X,Mr.,Damon455,Kshlerin58,JD,,S,white,nonhispanic,M,Cambridge  Massachusetts  US,803 Powlowski Park,Scituate,Massachusetts,Plymouth County,25023,02066,42.16374680818888,-70.80708512636961,23445.48,18839.51,870606
37 | ee98453d-79ed-910b-2e4a-9b32d9350fb6,1991-03-13,,999-70-4594,S99922993,X63601019X,Mr.,Luke971,Trantow673,,,M,asian,nonhispanic,M,Haiphong  Kiến An  VN,653 Jones Run Suite 14,Leominster,Massachusetts,Worcester County,25027,01453,42.55158257952831,-71.7700198666129,24117.54,26060.37,139930
38 | 98308074-8188-7b69-a1d1-be735cdc3ff4,1997-08-01,,999-14-9380,S99982480,X30896463X,Ms.,Carrol931,Rutherford999,,,,white,nonhispanic,F,Malden  Massachusetts  US,401 Reichel Route Suite 47,Pittsfield,Massachusetts,Berkshire County,25003,01201,42.414512991574455,-73.30118644516655,17732.85,963178.27,143644
39 | 2333e462-582c-9c83-d382-4c5e0c2c1ad0,2000-09-29,,999-58-7543,S99923245,X14074343X,Ms.,Lavette209,Zboncak558,,,,black,nonhispanic,F,Medfield  Massachusetts  US,994 Feest Crossroad Apt 13,Marblehead,Massachusetts,Essex County,25009,01945,42.49472255145156,-70.81702859735168,31189.37,595880.21,129433
40 | 931c7fd6-6330-1008-cef4-df84dd836d15,2002-10-04,,999-26-4422,S99938533,X25071442X,Mr.,Burton124,Stehr398,,,,white,nonhispanic,M,Taunton  Massachusetts  US,619 Upton Landing Apt 9,Chelmsford,Massachusetts,Middlesex County,,00000,42.548797580393405,-71.3540262724428,25994.44,467794.53,185360
41 | b3f13b30-5802-e5f3-685b-36c3c09283f1,2003-03-10,,999-20-4271,S99992852,,Ms.,Lessie363,Langworth352,,,,white,nonhispanic,F,Lynnfield  Massachusetts  US,745 Koelpin Trailer,Westborough,Massachusetts,Worcester County,25027,01581,42.30108671835689,-71.57704229893056,5139.91,88026.45,9657
42 | 4860e9a0-1263-6ca4-fe42-b7a73cbeec16,1947-12-14,2011-08-10,999-37-8682,S99927544,X81110019X,Mr.,Dominick530,Mills423,,,M,white,nonhispanic,M,Lynn  Massachusetts  US,1025 Spinka Overpass Suite 19,Wilbraham,Massachusetts,Hampden County,25013,01095,42.11528427077081,-72.45768605820261,58407.08,40127.28,326248
43 | 87424a5e-7848-aed5-fd59-4c8a76c2ed36,1965-08-08,,999-36-1150,S99945201,X58343864X,Mrs.,Zoila41,McGlynn426,,DuBuque211,M,white,nonhispanic,F,Brookline  Massachusetts  US,867 Langosh Grove Apt 84,Fairhaven,Massachusetts,Bristol County,,00000,41.63250541278397,-70.87313339013353,15950.00,1425035.79,15144
44 | 26a90721-54f3-b755-ecf4-a8aab978c01c,1963-02-27,,999-55-3195,S99981647,X12176676X,Mr.,Martín25,Roldán470,,,M,white,hispanic,M,Buenos Aires  Ciudad de Buenos Aires  AR,382 Satterfield Annex Suite 45,Ludlow,Massachusetts,Hampden County,,00000,42.219739387326875,-72.45861669907123,829967.44,0.00,49667
45 | 06d7ef99-093b-fe84-6d7c-52a3eab126fe,1955-04-04,,999-99-2436,S99953324,X10127197X,Ms.,Yetta429,Doyle959,,,S,white,nonhispanic,F,Grafton  Massachusetts  US,762 Senger Lodge,New Bedford,Massachusetts,Bristol County,25005,02740,41.62800607094596,-70.98717778570096,1079260.67,33271.04,49980
46 | 74d4ca38-9f05-2212-c539-44139fdd8ab4,2003-08-20,,999-44-9634,S99977402,,Mr.,Elden718,Collins926,,,,white,nonhispanic,M,Salem  Massachusetts  US,343 Reynolds Lock Unit 95,Cambridge,Massachusetts,Middlesex County,25017,02141,42.41937500378478,-71.10280714780403,7303.71,44500.70,16133
47 | f56c230b-3a7c-aca2-6363-fa3d46cf6596,1985-02-05,,999-51-3221,S99937081,X44375219X,Mrs.,Shaquana156,MacGyver246,,Deckow585,M,black,nonhispanic,F,Brockton  Massachusetts  US,282 Wintheiser Quay Suite 46,Sharon,Massachusetts,Norfolk County,25021,02067,42.07048110807395,-71.20906020387682,45950.72,726541.39,397408
48 | 37279a07-035d-e18b-bcd7-331dc3fe6304,1975-08-17,,999-91-9580,S99984964,X59848524X,Mr.,Jimmie93,Graham902,,,M,asian,nonhispanic,M,Beijing  Beijing Municipality  CN,269 Jones Estate Apt 87,Lynn,Massachusetts,Essex County,25009,01901,42.540185973577316,-70.95005850290717,50890.22,105568.63,58204
49 | 2d5a8517-f25f-9f66-9ab8-0a69425145c7,1960-04-25,,999-62-7937,S99914972,X40059543X,Mr.,Manuel446,Quitzon246,,,M,white,nonhispanic,M,New Bedford  Massachusetts  US,828 Hahn Ferry Suite 62,Raynham,Massachusetts,Bristol County,,00000,41.93105766594213,-71.08785486712216,54306.03,143291.33,77316
50 | 1666a800-a041-a2ca-4f9b-af668e740370,1963-06-05,,999-30-8851,S99915586,X37464865X,Mr.,Grant908,Hahn503,,,M,white,nonhispanic,M,Boston  Massachusetts  US,168 Ernser Viaduct Apt 32,Cambridge,Massachusetts,Middlesex County,25017,02141,42.33909824190683,-71.11622581495207,51477.06,68767.89,224975
51 | 8762040e-69bb-6ac8-685c-1d63a3d4dfe2,1960-02-25,,999-10-6028,S99958879,X62869090X,Ms.,Carmelita854,Hagenes547,,,S,white,nonhispanic,F,Taunton  Massachusetts  US,900 McClure Fort,Salem,Massachusetts,Essex County,25009,01970,42.52904093176596,-70.86167782746726,16750.00,1561478.75,13488
52 | 48334e94-64e7-91e8-d91b-1246110bf1ba,1932-07-09,2004-12-07,999-23-9351,S99916924,X14993625X,Mr.,Leandro563,Hane680,JD,,M,white,nonhispanic,M,Duxbury  Massachusetts  US,166 Jerde Avenue,Scituate,Massachusetts,Plymouth County,25023,02066,42.25402035749961,-70.7386488899729,62366.99,201314.86,870606
53 | a68f0fcf-424b-d8dd-3949-a69f0f3f9979,1935-09-06,2020-07-09,999-12-9121,S99987192,X75313923X,Mrs.,Janeth814,Feest103,,Shanahan202,M,white,nonhispanic,F,Boxford  Massachusetts  US,175 Mayer Frontage road Apt 63,East Longmeadow,Massachusetts,Hampden County,,00000,42.04366398532018,-72.53842235808679,909196.90,370234.71,29277
54 | 4f233603-d38e-fec1-7106-b6a09c62f28e,1947-12-14,2012-02-06,999-61-6740,S99975126,X69639917X,Mr.,Raymon366,Orn563,,,M,white,nonhispanic,M,East Bridgewater  Massachusetts  US,144 Waters Bypass Apt 13,Wilbraham,Massachusetts,Hampden County,25013,01095,42.1070174210795,-72.46946143213026,61483.39,148955.95,326248
55 | 561bc09a-56b9-859e-b926-fc66685d9df1,1946-03-21,,999-73-5643,S99995586,X13967823X,Mr.,Tyrell880,Schimmel440,,,M,white,nonhispanic,M,Georgetown  Massachusetts  US,102 Waters Estate Unit 93,New Bedford,Massachusetts,Bristol County,25005,02743,41.76092573197442,-70.93429596770599,69632.06,271187.57,62948
56 | 0c603e3d-ff1b-936d-14aa-9e875fa47cad,1932-07-09,2012-05-11,999-85-2178,S99970498,X20385492X,Mr.,Irving123,Hamill307,PhD,,M,white,nonhispanic,M,Kingston  Massachusetts  US,880 Bauch Lodge,Scituate,Massachusetts,Plymouth County,25023,02066,42.1894755210766,-70.72442915485766,67284.69,105065.45,870606
57 | da069417-667e-3b7e-8730-b00bf5dbcd7f,1935-09-06,2015-02-25,999-50-5586,S99935886,X71918603X,Mrs.,Ethel888,Corwin846,,Predovic534,M,white,nonhispanic,F,New Bedford  Massachusetts  US,344 Miller Street,East Longmeadow,Massachusetts,Hampden County,,00000,42.06043518424488,-72.46281354520333,724339.28,171776.51,29277
58 | dd19ae3a-2f3a-a636-f2bf-f3fe51e3ff7d,1947-12-14,2011-07-21,999-61-4140,S99946445,X63265370X,Mr.,Cedrick207,Cruickshank494,,,M,white,nonhispanic,M,Stoneham  Massachusetts  US,1051 Brakus Center Unit 33,Wilbraham,Massachusetts,Hampden County,25013,01095,42.153689936905664,-72.46563770143045,45887.05,1111058.21,326248
59 | 86636875-af39-df1b-edd7-209e8ffb77d2,1932-07-09,,999-26-8041,S99949411,X80854357X,Mr.,Garry927,Nikolaus26,MD,,M,white,nonhispanic,M,Rockland  Massachusetts  US,609 Paucek Skyway,Scituate,Massachusetts,Plymouth County,25023,02066,42.21737593221302,-70.71331268909627,53754.99,77450.36,870606
60 | 5dbb559c-7f46-f0bb-58ad-9069d1cce9b4,1935-09-06,,999-23-4696,S99949724,X57424902X,Mrs.,Hildred696,Casper496,,Kohler843,M,white,nonhispanic,F,Malden  Massachusetts  US,638 Bradtke Hollow,East Longmeadow,Massachusetts,Hampden County,,00000,42.04728057003359,-72.49659986178744,983859.74,213860.48,29277
61 | 9cfb6988-97da-cc56-58d4-81be46378f43,1947-12-14,,999-56-9201,S99974442,X17350166X,Mr.,Lonny638,Klein929,,,M,white,nonhispanic,M,Maynard  Massachusetts  US,848 Kuphal Junction,Wilbraham,Massachusetts,Hampden County,25013,01095,42.13813414111617,-72.45483019554929,71706.16,392055.44,326248
62 | 


--------------------------------------------------------------------------------
/nodes.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import yaml
  3 | from pocketflow import Node, BatchNode
  4 | from utils.call_llm import call_llm
  5 | 
  6 | def truncate_cell(value, max_length=50):
  7 |     """Truncate cell values for display purposes"""
  8 |     if pd.isna(value):
  9 |         return value
 10 |     str_value = str(value)
 11 |     if len(str_value) > max_length:
 12 |         return str_value[:max_length] + "..."
 13 |     return str_value
 14 | 
 15 | 
 16 | class DuplicateDetectionNode(Node):
 17 |     def prep(self, shared):
 18 |         df = shared["dataframe"]
 19 |         
 20 |         # Find duplicate rows
 21 |         duplicate_rows = df[df.duplicated(keep=False)]
 22 |         duplicate_count = len(duplicate_rows) - len(duplicate_rows.drop_duplicates())
 23 |         duplicate_percentage = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0
 24 |         
 25 |         # Get sample of duplicate rows for LLM analysis
 26 |         sample_duplicates = ""
 27 |         if duplicate_count > 0:
 28 |             sample_df = duplicate_rows.head(10).applymap(truncate_cell)
 29 |             sample_duplicates = sample_df.to_csv(index=False, quoting=1)
 30 |         
 31 |         # Get basic table info for context
 32 |         table_sample = df.head(5).applymap(truncate_cell).to_csv(index=False, quoting=1)
 33 |         
 34 |         return {
 35 |             "duplicate_count": duplicate_count,
 36 |             "duplicate_percentage": duplicate_percentage,
 37 |             "total_rows": len(df),
 38 |             "sample_duplicates": sample_duplicates,
 39 |             "table_sample": table_sample
 40 |         }
 41 | 
 42 |     def exec(self, prep_res):
 43 |         if prep_res["duplicate_count"] == 0:
 44 |             return {
 45 |                 "should_remove": False,
 46 |                 "analysis": "No duplicate rows found in the dataset."
 47 |             }
 48 |         
 49 |         prompt = f"""
 50 | You have a table with {prep_res["total_rows"]} total rows and {prep_res["duplicate_count"]} duplicate rows ({prep_res["duplicate_percentage"]:.2f}%).
 51 | 
 52 | Sample of the table:
 53 | {prep_res["table_sample"]}
 54 | 
 55 | Sample duplicate rows:
 56 | {prep_res["sample_duplicates"]}
 57 | 
 58 | Analyze these duplicates and decide whether they should be removed.
 59 | 
 60 | Return in YAML format:
 61 | ```yaml
 62 | should_remove: true/false
 63 | analysis: "Brief analysis explaining why duplicates should/shouldn't be removed"
 64 | ```
 65 | """
 66 |         
 67 |         response = call_llm(prompt)
 68 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
 69 |         result = yaml.safe_load(yaml_str)
 70 |         
 71 |         assert "should_remove" in result
 72 |         assert "analysis" in result
 73 |         assert isinstance(result["should_remove"], bool)
 74 |         assert isinstance(result["analysis"], str)
 75 |         
 76 |         return result
 77 | 
 78 |     def post(self, shared, prep_res, exec_res):
 79 |         shared["profile_results"]["duplicates"] = {
 80 |             "count": prep_res["duplicate_count"],
 81 |             "percentage": prep_res["duplicate_percentage"],
 82 |             "total_rows": prep_res["total_rows"],
 83 |             "should_remove": exec_res["should_remove"],
 84 |             "analysis": exec_res["analysis"],
 85 |             "sample_rows": prep_res["sample_duplicates"]
 86 |         }
 87 | 
 88 | class TableSummaryNode(Node):
 89 |     def prep(self, shared):
 90 |         df = shared["dataframe"]
 91 |         
 92 |         # Create a sample for LLM analysis
 93 |         sample_df = df.head(50).applymap(truncate_cell)
 94 |         sample_data = sample_df.to_csv(index=False, quoting=1)
 95 |         
 96 |         # Basic info
 97 |         column_names = list(df.columns)
 98 |         row_count = len(df)
 99 |         
100 |         return {
101 |             "sample_data": sample_data,
102 |             "column_names": column_names,
103 |             "row_count": row_count
104 |         }
105 | 
106 |     def exec(self, prep_res):
107 |         columns_str = ", ".join(prep_res["column_names"])
108 |         
109 |         prompt = f"""
110 | You have a table with {prep_res["row_count"]} rows and the following columns: {columns_str}
111 | 
112 | Sample data:
113 | {prep_res["sample_data"]}
114 | 
115 | Task: Summarize what this table represents.
116 | - Highlight: Include and highlight ALL column names as **Column_Name**
117 | - Structure: Start with the big picture, then explain how columns are related
118 | - Requirement: ALL column names must be mentioned and **highlighted**. Use exact column names (case sensitive)
119 | - Style: Use a few short sentences with simple words
120 | 
121 | Example: "The table contains information about ... with **Customer_ID**, **Order_Date**, and **Amount**..."
122 | 
123 | Your summary:
124 | """
125 |         
126 |         return call_llm(prompt)
127 | 
128 |     def post(self, shared, prep_res, exec_res):
129 |         shared["profile_results"]["table_summary"] = exec_res
130 |         return "default"
131 | 
132 | class ColumnDescriptionNode(BatchNode):
133 |     def prep(self, shared):
134 |         df = shared["dataframe"]
135 |         columns = list(df.columns)
136 |         
137 |         # Process columns in chunks of 10
138 |         chunks = []
139 |         for i in range(0, len(columns), 10):
140 |             chunk_columns = columns[i:i + 10]
141 |             chunk_df = df[chunk_columns].head(5).applymap(truncate_cell)
142 |             chunk_sample = chunk_df.to_csv(index=False, quoting=1)
143 |             chunks.append((chunk_columns, chunk_sample))
144 |         
145 |         return chunks
146 | 
147 |     def exec(self, chunk_data):
148 |         chunk_columns, chunk_sample = chunk_data
149 |         
150 |         prompt = f"""
151 | You have the following table columns and sample data:
152 | {chunk_sample}
153 | 
154 | For each column, provide a short description and suggest a better name if needed.
155 | 
156 | Return in YAML format:
157 | ```yaml
158 | {chunk_columns[0]}:
159 |   description: "Short description"
160 |   suggested_name: "new_column_name"
161 | ...
162 | ```
163 | """
164 |         
165 |         response = call_llm(prompt)
166 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
167 |         result = yaml.safe_load(yaml_str)
168 |         
169 |         # Validate all columns are present with required fields
170 |         for col in chunk_columns:
171 |             assert col in result, f"Column {col} missing from result"
172 |             assert "description" in result[col], f"Description missing for {col}"
173 |             assert "suggested_name" in result[col], f"Suggested name missing for {col}"
174 |             assert isinstance(result[col]["description"], str)
175 |             assert isinstance(result[col]["suggested_name"], str)
176 |         
177 |         return result
178 | 
179 |     def post(self, shared, prep_res, exec_res_list):
180 |         # Combine results from all chunks
181 |         all_descriptions = {}
182 |         for chunk_result in exec_res_list:
183 |             all_descriptions.update(chunk_result)
184 |         
185 |         # Convert to the expected format (now already in the right structure from YAML)
186 |         shared["profile_results"]["column_descriptions"] = all_descriptions
187 |         return "default"
188 | 
189 | class DataTypeAnalysisNode(Node):
190 |     def prep(self, shared):
191 |         df = shared["dataframe"]
192 |         
193 |         # Get current data types
194 |         current_types = {col: str(df[col].dtype) for col in df.columns}
195 |         
196 |         # Get sample data
197 |         sample_df = df.head(10).applymap(truncate_cell)
198 |         sample_data = sample_df.to_csv(index=False, quoting=1)
199 |         
200 |         return {
201 |             "sample_data": sample_data,
202 |             "current_types": current_types,
203 |             "columns": list(df.columns)
204 |         }
205 | 
206 |     def exec(self, prep_res):
207 |         types_info = "\n".join([f"{col}: currently {dtype}" for col, dtype in prep_res["current_types"].items()])
208 |         valid_types = ["int64", "float64", "object", "datetime64", "bool", "category"]
209 |         
210 |         prompt = f"""
211 | You have the following table with current data types:
212 | {types_info}
213 | 
214 | Sample data:
215 | {prep_res["sample_data"]}
216 | 
217 | For each column, suggest the most appropriate data type from: {valid_types}
218 | 
219 | Return in YAML format:
220 | ```yaml
221 | column1:
222 |   suggested_type: "int64"
223 |   reason: "Contains only integer values"
224 | ...
225 | ```
226 | """
227 |         
228 |         response = call_llm(prompt)
229 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
230 |         result = yaml.safe_load(yaml_str)
231 |         
232 |         # Validate all columns are present with required fields
233 |         for col in prep_res["columns"]:
234 |             assert col in result, f"Column {col} missing from result"
235 |             assert "suggested_type" in result[col], f"Suggested type missing for {col}"
236 |             assert "reason" in result[col], f"Reason missing for {col}"
237 |             assert result[col]["suggested_type"] in valid_types, f"Invalid type for {col}: {result[col]['suggested_type']}"
238 |             assert isinstance(result[col]["reason"], str)
239 |         
240 |         return result
241 | 
242 |     def post(self, shared, prep_res, exec_res):
243 |         # Combine current and suggested types
244 |         data_types = {}
245 |         for col in prep_res["columns"]:
246 |             data_types[col] = {
247 |                 "current_type": prep_res["current_types"][col],
248 |                 "suggested_type": exec_res[col]["suggested_type"],
249 |                 "reason": exec_res[col]["reason"]
250 |             }
251 |         
252 |         shared["profile_results"]["data_types"] = data_types
253 |         return "default"
254 | 
255 | class MissingValuesAnalysisNode(Node):
256 |     def prep(self, shared):
257 |         df = shared["dataframe"]
258 |         
259 |         # Calculate missing values
260 |         missing_info = {}
261 |         for col in df.columns:
262 |             missing_count = df[col].isna().sum()
263 |             if missing_count > 0:
264 |                 missing_percentage = (missing_count / len(df)) * 100
265 |                 missing_info[col] = {
266 |                     "count": missing_count,
267 |                     "percentage": missing_percentage
268 |                 }
269 |         
270 |         # Get sample data
271 |         sample_df = df.head(10).applymap(truncate_cell)
272 |         sample_data = sample_df.to_csv(index=False, quoting=1)
273 |         
274 |         return {
275 |             "missing_info": missing_info,
276 |             "sample_data": sample_data,
277 |             "total_rows": len(df)
278 |         }
279 | 
280 |     def exec(self, prep_res):
281 |         if not prep_res["missing_info"]:
282 |             return {
283 |                 "reasoning": "No missing values found in any columns.",
284 |                 "columns_analysis": {}
285 |             }
286 |         
287 |         missing_desc = "\n".join([
288 |             f"{col}: {info['count']} missing ({info['percentage']:.1f}%)" 
289 |             for col, info in prep_res["missing_info"].items()
290 |         ])
291 |         
292 |         prompt = f"""
293 | You have a table with the following missing values:
294 | {missing_desc}
295 | 
296 | Sample data for context:
297 | {prep_res["sample_data"]}
298 | 
299 | For each column with missing values, determine if missing values are meaningful or problematic.
300 | 
301 | Return in YAML format:
302 | ```yaml
303 | overall_analysis: "Brief overall analysis"
304 | columns:
305 |   column_name:
306 |     is_meaningful: true/false
307 |     reason: "Brief explanation"
308 |   ...
309 | ```
310 | """
311 |         
312 |         response = call_llm(prompt)
313 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
314 |         result = yaml.safe_load(yaml_str)
315 |         
316 |         # Validate structure
317 |         assert "overall_analysis" in result
318 |         assert "columns" in result
319 |         assert isinstance(result["overall_analysis"], str)
320 |         assert isinstance(result["columns"], dict)
321 |         
322 |         # Validate each column analysis
323 |         for col in prep_res["missing_info"].keys():
324 |             assert col in result["columns"], f"Missing analysis for column {col}"
325 |             assert "is_meaningful" in result["columns"][col]
326 |             assert "reason" in result["columns"][col]
327 |             assert isinstance(result["columns"][col]["is_meaningful"], bool)
328 |             assert isinstance(result["columns"][col]["reason"], str)
329 |         
330 |         return result
331 | 
332 |     def post(self, shared, prep_res, exec_res):
333 |         missing_values = {}
334 |         
335 |         # Process columns with missing values
336 |         for col, info in prep_res["missing_info"].items():
337 |             analysis = exec_res["columns"][col]
338 |             missing_values[col] = {
339 |                 "count": info["count"],
340 |                 "percentage": info["percentage"],
341 |                 "is_meaningful": analysis["is_meaningful"],
342 |                 "reason": analysis["reason"]
343 |             }
344 |         
345 |         # Add columns with no missing values
346 |         df = shared["dataframe"]
347 |         for col in df.columns:
348 |             if col not in missing_values:
349 |                 missing_values[col] = {
350 |                     "count": 0,
351 |                     "percentage": 0.0,
352 |                     "is_meaningful": True,
353 |                     "reason": "No missing values"
354 |                 }
355 |         
356 |         shared["profile_results"]["missing_values"] = missing_values
357 |         shared["profile_results"]["missing_analysis"] = exec_res["overall_analysis"]
358 |         return "default"
359 | 
360 | class UniquenessAnalysisNode(Node):
361 |     def prep(self, shared):
362 |         df = shared["dataframe"]
363 |         
364 |         # Calculate uniqueness for each column
365 |         uniqueness_info = {}
366 |         for col in df.columns:
367 |             unique_count = df[col].nunique()
368 |             total_count = len(df)
369 |             unique_percentage = (unique_count / total_count) * 100 if total_count > 0 else 0
370 |             
371 |             uniqueness_info[col] = {
372 |                 "unique_count": unique_count,
373 |                 "total_count": total_count,
374 |                 "unique_percentage": unique_percentage
375 |             }
376 |         
377 |         # Get sample data and table summary for context
378 |         sample_df = df.head(10).applymap(truncate_cell)
379 |         sample_data = sample_df.to_csv(index=False, quoting=1)
380 |         table_summary = shared["profile_results"].get("table_summary", "")
381 |         
382 |         # Get highly unique columns (>90% unique)
383 |         highly_unique = {col: info for col, info in uniqueness_info.items() 
384 |                         if info["unique_percentage"] > 90}
385 |         
386 |         return {
387 |             "uniqueness_info": uniqueness_info,
388 |             "highly_unique": highly_unique,
389 |             "sample_data": sample_data,
390 |             "table_summary": table_summary
391 |         }
392 | 
393 |     def exec(self, prep_res):
394 |         if not prep_res["highly_unique"]:
395 |             return {
396 |                 "reasoning": "No columns found that could serve as candidate keys.",
397 |                 "candidate_keys": {}
398 |             }
399 |         
400 |         highly_unique_desc = "\n".join([
401 |             f"{col}: {info['unique_count']}/{info['total_count']} unique ({info['unique_percentage']:.1f}%)"
402 |             for col, info in prep_res["highly_unique"].items()
403 |         ])
404 |         
405 |         prompt = f"""
406 | Table context: {prep_res["table_summary"]}
407 | 
408 | Sample data:
409 | {prep_res["sample_data"]}
410 | 
411 | The following columns have high uniqueness:
412 | {highly_unique_desc}
413 | 
414 | Analyze which columns could serve as candidate keys (unique identifiers) for this table.
415 | Consider:
416 | - What each row represents in this table
417 | - Whether the column values should be unique across all rows
418 | - Avoid continuous numerical values (like temperatures, prices) that happen to be unique in the sample
419 | 
420 | Return in YAML format:
421 | ```yaml
422 | reasoning: "Analysis of which columns can serve as identifiers..."
423 | candidate_keys:
424 |   column_name:
425 |     is_candidate_key: true/false
426 |     explanation: "Why this column is/isn't a good candidate key"
427 |   ...
428 | ```
429 | """
430 |         
431 |         response = call_llm(prompt)
432 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
433 |         return yaml.safe_load(yaml_str)
434 | 
435 |     def post(self, shared, prep_res, exec_res):
436 |         uniqueness = {}
437 |         
438 |         for col, info in prep_res["uniqueness_info"].items():
439 |             candidate_analysis = exec_res.get("candidate_keys", {}).get(col, {})
440 |             uniqueness[col] = {
441 |                 "unique_count": info["unique_count"],
442 |                 "unique_percentage": info["unique_percentage"],
443 |                 "is_candidate_key": candidate_analysis.get("is_candidate_key", False),
444 |                 "explanation": candidate_analysis.get("explanation", "")
445 |             }
446 |         
447 |         shared["profile_results"]["uniqueness"] = uniqueness
448 |         shared["profile_results"]["uniqueness_reasoning"] = exec_res.get("reasoning", "")
449 |         return "default"
450 | 
451 | class UnusualValuesDetectionNode(BatchNode):
452 |     def prep(self, shared):
453 |         df = shared["dataframe"]
454 |         columns = list(df.columns)
455 |         
456 |         # Create analysis tasks for each column
457 |         column_tasks = []
458 |         for col in columns:
459 |             # Get sample of distinct values (up to 1000 for inspection)
460 |             sample_values = df[col].dropna().drop_duplicates().head(1000)
461 |             sample_list = [truncate_cell(val, 100) for val in sample_values]
462 |             
463 |             column_tasks.append({
464 |                 "column_name": col,
465 |                 "sample_values": sample_list,
466 |                 "data_type": str(df[col].dtype)
467 |             })
468 |         
469 |         return column_tasks
470 | 
471 |     def exec(self, column_task):
472 |         col_name = column_task["column_name"]
473 |         sample_values = column_task["sample_values"]
474 |         data_type = column_task["data_type"]
475 |         
476 |         if not sample_values:
477 |             return {
478 |                 "column_name": col_name,
479 |                 "has_unusual": False,
480 |                 "explanation": "No values to analyze (all missing)"
481 |             }
482 |         
483 |         values_str = ", ".join([f"'{val}'" for val in sample_values[:15]])
484 |         
485 |         prompt = f"""
486 | Column "{col_name}" (type: {data_type}) has the following sample values:
487 | {values_str}
488 | 
489 | Check if there are any unusual values that seem wrong or inconsistent.
490 | 
491 | Return in YAML format:
492 | ```yaml
493 | has_unusual: true/false
494 | explanation: "Brief explanation of findings"
495 | ```
496 | """
497 |         
498 |         response = call_llm(prompt)
499 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
500 |         result = yaml.safe_load(yaml_str)
501 |         
502 |         # Validate structure
503 |         assert "has_unusual" in result
504 |         assert "explanation" in result
505 |         assert isinstance(result["has_unusual"], bool)
506 |         assert isinstance(result["explanation"], str)
507 |         
508 |         result["column_name"] = col_name
509 |         return result
510 | 
511 |     def post(self, shared, prep_res, exec_res_list):
512 |         unusual_values = {}
513 |         
514 |         for result in exec_res_list:
515 |             col_name = result["column_name"]
516 |             unusual_values[col_name] = {
517 |                 "has_unusual": result["has_unusual"],
518 |                 "explanation": result["explanation"]
519 |             }
520 |         
521 |         shared["profile_results"]["unusual_values"] = unusual_values
522 |         return "default"
523 | 
524 | class GenerateReportNode(Node):
525 |     def prep(self, shared):
526 |         return shared["profile_results"]
527 | 
528 |     def exec(self, profile_results):
529 |         # Generate a comprehensive report
530 |         report_sections = []
531 |         
532 |         # Title
533 |         report_sections.append("# Data Profiling Report\n")
534 |         
535 |         # Table Summary
536 |         if "table_summary" in profile_results:
537 |             report_sections.append("## Table Summary")
538 |             report_sections.append(profile_results["table_summary"])
539 |             report_sections.append("")
540 |         
541 |         # Duplicates
542 |         if "duplicates" in profile_results:
543 |             dup = profile_results["duplicates"]
544 |             report_sections.append("## Duplicate Analysis")
545 |             report_sections.append(f"- **Total rows**: {dup['total_rows']}")
546 |             report_sections.append(f"- **Duplicate rows**: {dup['count']} ({dup['percentage']:.2f}%)")
547 |             report_sections.append(f"- **Should remove**: {dup['should_remove']}")
548 |             report_sections.append(f"- **Analysis**: {dup['analysis']}")
549 |             report_sections.append("")
550 |         
551 |         # Column Descriptions
552 |         if "column_descriptions" in profile_results:
553 |             report_sections.append("## Column Descriptions")
554 |             for col, info in profile_results["column_descriptions"].items():
555 |                 suggested = f" → *{info['suggested_name']}*" if info['suggested_name'] != col else ""
556 |                 report_sections.append(f"- **{col}**{suggested}: {info['description']}")
557 |             report_sections.append("")
558 |         
559 |         # Data Types
560 |         if "data_types" in profile_results:
561 |             report_sections.append("## Data Type Analysis")
562 |             changes_found = False
563 |             for col, info in profile_results["data_types"].items():
564 |                 if info['suggested_type'] != info['current_type']:
565 |                     report_sections.append(f"- **{col}**: {info['current_type']} → *{info['suggested_type']}* ({info['reason']})")
566 |                     changes_found = True
567 |             if not changes_found:
568 |                 report_sections.append("- All data types are appropriate")
569 |             report_sections.append("")
570 |         
571 |         # Missing Values
572 |         if "missing_values" in profile_results:
573 |             report_sections.append("## Missing Values Analysis")
574 |             if "missing_analysis" in profile_results:
575 |                 report_sections.append(f"**Overview**: {profile_results['missing_analysis']}")
576 |                 report_sections.append("")
577 |             
578 |             problematic_missing = []
579 |             meaningful_missing = []
580 |             
581 |             for col, info in profile_results["missing_values"].items():
582 |                 if info['count'] > 0:
583 |                     entry = f"**{col}**: {info['count']} missing ({info['percentage']:.1f}%) - {info['reason']}"
584 |                     if info['is_meaningful']:
585 |                         meaningful_missing.append(entry)
586 |                     else:
587 |                         problematic_missing.append(entry)
588 |             
589 |             if problematic_missing:
590 |                 report_sections.append("### Problematic Missing Values")
591 |                 for entry in problematic_missing:
592 |                     report_sections.append(f"- {entry}")
593 |                 report_sections.append("")
594 |             
595 |             if meaningful_missing:
596 |                 report_sections.append("### Likely Meaningful Missing Values") 
597 |                 for entry in meaningful_missing:
598 |                     report_sections.append(f"- {entry}")
599 |                 report_sections.append("")
600 |         
601 |         # Uniqueness
602 |         if "uniqueness" in profile_results:
603 |             report_sections.append("## Uniqueness Analysis")
604 |             candidate_keys = []
605 |             highly_unique = []
606 |             
607 |             for col, info in profile_results["uniqueness"].items():
608 |                 if info['is_candidate_key']:
609 |                     candidate_keys.append(f"**{col}**: {info['explanation']}")
610 |                 elif info['unique_percentage'] > 50:
611 |                     highly_unique.append(f"**{col}**: {info['unique_percentage']:.1f}% unique")
612 |             
613 |             if candidate_keys:
614 |                 report_sections.append("### Candidate Key Columns")
615 |                 for key in candidate_keys:
616 |                     report_sections.append(f"- {key}")
617 |                 report_sections.append("")
618 |                 
619 |             if highly_unique:
620 |                 report_sections.append("### Highly Unique Columns")
621 |                 for col in highly_unique:
622 |                     report_sections.append(f"- {col}")
623 |                 report_sections.append("")
624 |         
625 |         # Unusual Values
626 |         if "unusual_values" in profile_results:
627 |             report_sections.append("## Unusual Values Detection")
628 |             unusual_found = []
629 |             
630 |             for col, info in profile_results["unusual_values"].items():
631 |                 if info['has_unusual']:
632 |                     unusual_found.append(f"**{col}**: {info['explanation']}")
633 |             
634 |             if unusual_found:
635 |                 for finding in unusual_found:
636 |                     report_sections.append(f"- {finding}")
637 |             else:
638 |                 report_sections.append("- No unusual values detected")
639 |             report_sections.append("")
640 |         
641 |         return "\n".join(report_sections)
642 | 
643 |     def post(self, shared, prep_res, exec_res):
644 |         shared["final_report"] = exec_res
645 |         print("Data profiling complete! Report generated.")
646 |         return "default"


--------------------------------------------------------------------------------
/.clinerules:
--------------------------------------------------------------------------------
   1 | ---
   2 | layout: default
   3 | title: "Agentic Coding"
   4 | ---
   5 | 
   6 | # Agentic Coding: Humans Design, Agents code!
   7 | 
   8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
   9 | {: .warning }
  10 | 
  11 | ## Agentic Coding Steps
  12 | 
  13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation:
  14 | 
  15 | | Steps                  | Human      | AI        | Comment                                                                 |
  16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------|
  17 | | 1. Requirements | ★★★ High  | ★☆☆ Low   | Humans understand the requirements and context.                    |
  18 | | 2. Flow          | ★★☆ Medium | ★★☆ Medium |  Humans specify the high-level design, and the AI fills in the details. |
  19 | | 3. Utilities   | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. |
  20 | | 4. Data          | ★☆☆ Low    | ★★★ High   | AI designs the data schema, and humans verify.                            |
  21 | | 5. Node          | ★☆☆ Low   | ★★★ High  | The AI helps design the node based on the flow.          |
  22 | | 6. Implementation      | ★☆☆ Low   | ★★★ High  |  The AI implements the flow based on the design. |
  23 | | 7. Optimization        | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. |
  24 | | 8. Reliability         | ★☆☆ Low   | ★★★ High  |  The AI writes test cases and addresses corner cases.     |
  25 | 
  26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit. 
  27 |     - Understand AI systems' strengths and limitations:
  28 |       - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails)
  29 |       - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL)
  30 |       - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning)
  31 |     - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features.
  32 |     - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early.
  33 | 
  34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes.
  35 |     - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)).
  36 |       - For each node in the flow, start with a high-level one-line description of what it does.
  37 |       - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine).
  38 |       - If using **Agent**, specify what are the inputs (context) and what are the possible actions.
  39 |       - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows.
  40 |     - Outline the flow and draw it in a mermaid diagram. For example:
  41 |       ```mermaid
  42 |       flowchart LR
  43 |           start[Start] --> batch[Batch]
  44 |           batch --> check[Check]
  45 |           check -->|OK| process
  46 |           check -->|Error| fix[Fix]
  47 |           fix --> check
  48 |           
  49 |           subgraph process[Process]
  50 |             step1[Step 1] --> step2[Step 2]
  51 |           end
  52 |           
  53 |           process --> endNode[End]
  54 |       ```
  55 |     - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition.  
  56 |       {: .best-practice }
  57 | 
  58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions.
  59 |     - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world:
  60 |         <div align="center"><img src="https://github.com/the-pocket/.github/raw/main/assets/utility.png?raw=true" width="400"/></div>
  61 | 
  62 |         - Reading inputs (e.g., retrieving Slack messages, reading emails)
  63 |         - Writing outputs (e.g., generating reports, sending emails)
  64 |         - Using external tools (e.g., calling LLMs, searching the web)
  65 |         - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system.
  66 |     - For each utility function, implement it and write a simple test.
  67 |     - Document their input/output, as well as why they are necessary. For example:
  68 |       - `name`: `get_embedding` (`utils/get_embedding.py`)
  69 |       - `input`: `str`
  70 |       - `output`: a vector of 3072 floats
  71 |       - `necessity`: Used by the second node to embed text
  72 |     - Example utility implementation:
  73 |       ```python
  74 |       # utils/call_llm.py
  75 |       from openai import OpenAI
  76 | 
  77 |       def call_llm(prompt):    
  78 |           client = OpenAI(api_key="YOUR_API_KEY_HERE")
  79 |           r = client.chat.completions.create(
  80 |               model="gpt-4o",
  81 |               messages=[{"role": "user", "content": prompt}]
  82 |           )
  83 |           return r.choices[0].message.content
  84 |           
  85 |       if __name__ == "__main__":
  86 |           prompt = "What is the meaning of life?"
  87 |           print(call_llm(prompt))
  88 |       ```
  89 |     - > **Sometimes, design Utilities before Flow:**  For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them.
  90 |       {: .best-practice }
  91 |     - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures.
  92 |       {: .warning }
  93 | 
  94 | 4. **Data Design**: Design the shared store that nodes will use to communicate.
  95 |    - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data.
  96 |       - For simple systems, use an in-memory dictionary.
  97 |       - For more complex systems or when persistence is required, use a database.
  98 |       - **Don't Repeat Yourself**: Use in-memory references or foreign keys.
  99 |       - Example shared store design:
 100 |         ```python
 101 |         shared = {
 102 |             "user": {
 103 |                 "id": "user123",
 104 |                 "context": {                # Another nested dict
 105 |                     "weather": {"temp": 72, "condition": "sunny"},
 106 |                     "location": "San Francisco"
 107 |                 }
 108 |             },
 109 |             "results": {}                   # Empty dict to store outputs
 110 |         }
 111 |         ```
 112 | 
 113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions.
 114 |    - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example:
 115 |      - `type`: Regular (or Batch, or Async)
 116 |      - `prep`: Read "text" from the shared store
 117 |      - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures.
 118 |      - `post`: Write "embedding" to the shared store
 119 | 
 120 | 6. **Implementation**: Implement the initial nodes and flows based on the design.
 121 |    - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins!
 122 |    - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking.
 123 |    - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system.
 124 |    - Add logging throughout the code to facilitate debugging.
 125 | 
 126 | 7. **Optimization**:
 127 |    - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start.
 128 |    - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts.
 129 |    - If your flow design is already solid, move on to micro-optimizations:
 130 |      - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity.
 131 |      - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone.
 132 | 
 133 |    - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times.
 134 |      >
 135 |      > <div align="center"><img src="https://github.com/the-pocket/.github/raw/main/assets/success.png?raw=true" width="400"/></div>
 136 |      {: .best-practice }
 137 | 
 138 | 8. **Reliability**  
 139 |    - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times.
 140 |    - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging.
 141 |    - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain.
 142 | 
 143 | ## Example LLM Project File Structure
 144 | 
 145 | ```
 146 | my_project/
 147 | ├── main.py
 148 | ├── nodes.py
 149 | ├── flow.py
 150 | ├── utils/
 151 | │   ├── __init__.py
 152 | │   ├── call_llm.py
 153 | │   └── search_web.py
 154 | ├── requirements.txt
 155 | └── docs/
 156 |     └── design.md
 157 | ```
 158 | 
 159 | - **`requirements.txt`**: Lists the Python dependencies for the project.
 160 |   ```
 161 |   PyYAML
 162 |   pocketflow
 163 |   ```
 164 | 
 165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*.
 166 |   ~~~
 167 |   # Design Doc: Your Project Name
 168 | 
 169 |   > Please DON'T remove notes for AI
 170 | 
 171 |   ## Requirements
 172 | 
 173 |   > Notes for AI: Keep it simple and clear.
 174 |   > If the requirements are abstract, write concrete user stories
 175 | 
 176 | 
 177 |   ## Flow Design
 178 | 
 179 |   > Notes for AI:
 180 |   > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
 181 |   > 2. Present a concise, high-level description of the workflow.
 182 | 
 183 |   ### Applicable Design Pattern:
 184 | 
 185 |   1. Map the file summary into chunks, then reduce these chunks into a final summary.
 186 |   2. Agentic file finder
 187 |     - *Context*: The entire summary of the file
 188 |     - *Action*: Find the file
 189 | 
 190 |   ### Flow high-level Design:
 191 | 
 192 |   1. **First Node**: This node is for ...
 193 |   2. **Second Node**: This node is for ...
 194 |   3. **Third Node**: This node is for ...
 195 | 
 196 |   ```mermaid
 197 |   flowchart TD
 198 |       firstNode[First Node] --> secondNode[Second Node]
 199 |       secondNode --> thirdNode[Third Node]
 200 |   ```
 201 |   ## Utility Functions
 202 | 
 203 |   > Notes for AI:
 204 |   > 1. Understand the utility function definition thoroughly by reviewing the doc.
 205 |   > 2. Include only the necessary utility functions, based on nodes in the flow.
 206 | 
 207 |   1. **Call LLM** (`utils/call_llm.py`)
 208 |     - *Input*: prompt (str)
 209 |     - *Output*: response (str)
 210 |     - Generally used by most nodes for LLM tasks
 211 | 
 212 |   2. **Embedding** (`utils/get_embedding.py`)
 213 |     - *Input*: str
 214 |     - *Output*: a vector of 3072 floats
 215 |     - Used by the second node to embed text
 216 | 
 217 |   ## Node Design
 218 | 
 219 |   ### Shared Store
 220 | 
 221 |   > Notes for AI: Try to minimize data redundancy
 222 | 
 223 |   The shared store structure is organized as follows:
 224 | 
 225 |   ```python
 226 |   shared = {
 227 |       "key": "value"
 228 |   }
 229 |   ```
 230 | 
 231 |   ### Node Steps
 232 | 
 233 |   > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
 234 | 
 235 |   1. First Node
 236 |     - *Purpose*: Provide a short explanation of the node’s function
 237 |     - *Type*: Decide between Regular, Batch, or Async
 238 |     - *Steps*:
 239 |       - *prep*: Read "key" from the shared store
 240 |       - *exec*: Call the utility function
 241 |       - *post*: Write "key" to the shared store
 242 | 
 243 |   2. Second Node
 244 |     ...
 245 |   ~~~
 246 | 
 247 | 
 248 | - **`utils/`**: Contains all utility functions.
 249 |   - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`.
 250 |   - Each file should also include a `main()` function to try that API call
 251 |   ```python
 252 |   from google import genai
 253 |   import os
 254 | 
 255 |   def call_llm(prompt: str) -> str:
 256 |       client = genai.Client(
 257 |           api_key=os.getenv("GEMINI_API_KEY", ""),
 258 |       )
 259 |       model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
 260 |       response = client.models.generate_content(model=model, contents=[prompt])
 261 |       return response.text
 262 | 
 263 |   if __name__ == "__main__":
 264 |       test_prompt = "Hello, how are you?"
 265 | 
 266 |       # First call - should hit the API
 267 |       print("Making call...")
 268 |       response1 = call_llm(test_prompt, use_cache=False)
 269 |       print(f"Response: {response1}")
 270 |   ```
 271 | 
 272 | - **`nodes.py`**: Contains all the node definitions.
 273 |   ```python
 274 |   # nodes.py
 275 |   from pocketflow import Node
 276 |   from utils.call_llm import call_llm
 277 | 
 278 |   class GetQuestionNode(Node):
 279 |       def exec(self, _):
 280 |           # Get question directly from user input
 281 |           user_question = input("Enter your question: ")
 282 |           return user_question
 283 |       
 284 |       def post(self, shared, prep_res, exec_res):
 285 |           # Store the user's question
 286 |           shared["question"] = exec_res
 287 |           return "default"  # Go to the next node
 288 | 
 289 |   class AnswerNode(Node):
 290 |       def prep(self, shared):
 291 |           # Read question from shared
 292 |           return shared["question"]
 293 |       
 294 |       def exec(self, question):
 295 |           # Call LLM to get the answer
 296 |           return call_llm(question)
 297 |       
 298 |       def post(self, shared, prep_res, exec_res):
 299 |           # Store the answer in shared
 300 |           shared["answer"] = exec_res
 301 |   ```
 302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them.
 303 |   ```python
 304 |   # flow.py
 305 |   from pocketflow import Flow
 306 |   from nodes import GetQuestionNode, AnswerNode
 307 | 
 308 |   def create_qa_flow():
 309 |       """Create and return a question-answering flow."""
 310 |       # Create nodes
 311 |       get_question_node = GetQuestionNode()
 312 |       answer_node = AnswerNode()
 313 |       
 314 |       # Connect nodes in sequence
 315 |       get_question_node >> answer_node
 316 |       
 317 |       # Create flow starting with input node
 318 |       return Flow(start=get_question_node)
 319 |   ```
 320 | - **`main.py`**: Serves as the project's entry point.
 321 |   ```python
 322 |   # main.py
 323 |   from flow import create_qa_flow
 324 | 
 325 |   # Example main function
 326 |   # Please replace this with your own main function
 327 |   def main():
 328 |       shared = {
 329 |           "question": None,  # Will be populated by GetQuestionNode from user input
 330 |           "answer": None     # Will be populated by AnswerNode
 331 |       }
 332 | 
 333 |       # Create the flow and run it
 334 |       qa_flow = create_qa_flow()
 335 |       qa_flow.run(shared)
 336 |       print(f"Question: {shared['question']}")
 337 |       print(f"Answer: {shared['answer']}")
 338 | 
 339 |   if __name__ == "__main__":
 340 |       main()
 341 |   ```
 342 | 
 343 | ================================================
 344 | File: docs/index.md
 345 | ================================================
 346 | ---
 347 | layout: default
 348 | title: "Home"
 349 | nav_order: 1
 350 | ---
 351 | 
 352 | # Pocket Flow
 353 | 
 354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
 355 | 
 356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, and vendor lock-in.
 357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more.  
 358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
 359 | 
 360 | <div align="center">
 361 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/meme.jpg?raw=true" width="400"/>
 362 | </div>
 363 | 
 364 | ## Core Abstraction
 365 | 
 366 | We model the LLM workflow as a **Graph + Shared Store**:
 367 | 
 368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks.
 369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges).
 370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows.
 371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks.
 372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks.
 373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks.
 374 | 
 375 | <div align="center">
 376 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/abstraction.png" width="500"/>
 377 | </div>
 378 | 
 379 | ## Design Pattern
 380 | 
 381 | From there, it’s easy to implement popular design patterns:
 382 | 
 383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions.
 384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines.
 385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation.
 386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps.
 387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently.
 388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents.
 389 | 
 390 | <div align="center">
 391 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/design.png" width="500"/>
 392 | </div>
 393 | 
 394 | ## Utility Function
 395 | 
 396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
 397 | 
 398 | - [LLM Wrapper](./utility_function/llm.md)
 399 | - [Viz and Debug](./utility_function/viz.md)
 400 | - [Web Search](./utility_function/websearch.md)
 401 | - [Chunking](./utility_function/chunking.md)
 402 | - [Embedding](./utility_function/embedding.md)
 403 | - [Vector Databases](./utility_function/vector.md)
 404 | - [Text-to-Speech](./utility_function/text_to_speech.md)
 405 | 
 406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework:
 407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
 408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
 409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
 410 | 
 411 | ## Ready to build your Apps? 
 412 | 
 413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow!
 414 | 
 415 | ================================================
 416 | File: docs/core_abstraction/async.md
 417 | ================================================
 418 | ---
 419 | layout: default
 420 | title: "(Advanced) Async"
 421 | parent: "Core Abstraction"
 422 | nav_order: 5
 423 | ---
 424 | 
 425 | # (Advanced) Async
 426 | 
 427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for:
 428 | 
 429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way.
 430 | 2. **exec_async()**: Typically used for async LLM calls.
 431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`.
 432 | 
 433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes.
 434 | 
 435 | ### Example
 436 | 
 437 | ```python
 438 | class SummarizeThenVerify(AsyncNode):
 439 |     async def prep_async(self, shared):
 440 |         # Example: read a file asynchronously
 441 |         doc_text = await read_file_async(shared["doc_path"])
 442 |         return doc_text
 443 | 
 444 |     async def exec_async(self, prep_res):
 445 |         # Example: async LLM call
 446 |         summary = await call_llm_async(f"Summarize: {prep_res}")
 447 |         return summary
 448 | 
 449 |     async def post_async(self, shared, prep_res, exec_res):
 450 |         # Example: wait for user feedback
 451 |         decision = await gather_user_feedback(exec_res)
 452 |         if decision == "approve":
 453 |             shared["summary"] = exec_res
 454 |             return "approve"
 455 |         return "deny"
 456 | 
 457 | summarize_node = SummarizeThenVerify()
 458 | final_node = Finalize()
 459 | 
 460 | # Define transitions
 461 | summarize_node - "approve" >> final_node
 462 | summarize_node - "deny"    >> summarize_node  # retry
 463 | 
 464 | flow = AsyncFlow(start=summarize_node)
 465 | 
 466 | async def main():
 467 |     shared = {"doc_path": "document.txt"}
 468 |     await flow.run_async(shared)
 469 |     print("Final Summary:", shared.get("summary"))
 470 | 
 471 | asyncio.run(main())
 472 | ```
 473 | 
 474 | ================================================
 475 | File: docs/core_abstraction/batch.md
 476 | ================================================
 477 | ---
 478 | layout: default
 479 | title: "Batch"
 480 | parent: "Core Abstraction"
 481 | nav_order: 4
 482 | ---
 483 | 
 484 | # Batch
 485 | 
 486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases:
 487 | - **Chunk-based** processing (e.g., splitting large texts).
 488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs).
 489 | 
 490 | ## 1. BatchNode
 491 | 
 492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`:
 493 | 
 494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator).
 495 | - **`exec(item)`**: called **once** per item in that iterable.
 496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**.
 497 | 
 498 | 
 499 | ### Example: Summarize a Large File
 500 | 
 501 | ```python
 502 | class MapSummaries(BatchNode):
 503 |     def prep(self, shared):
 504 |         # Suppose we have a big file; chunk it
 505 |         content = shared["data"]
 506 |         chunk_size = 10000
 507 |         chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
 508 |         return chunks
 509 | 
 510 |     def exec(self, chunk):
 511 |         prompt = f"Summarize this chunk in 10 words: {chunk}"
 512 |         summary = call_llm(prompt)
 513 |         return summary
 514 | 
 515 |     def post(self, shared, prep_res, exec_res_list):
 516 |         combined = "\n".join(exec_res_list)
 517 |         shared["summary"] = combined
 518 |         return "default"
 519 | 
 520 | map_summaries = MapSummaries()
 521 | flow = Flow(start=map_summaries)
 522 | flow.run(shared)
 523 | ```
 524 | 
 525 | ---
 526 | 
 527 | ## 2. BatchFlow
 528 | 
 529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set.
 530 | 
 531 | ### Example: Summarize Many Files
 532 | 
 533 | ```python
 534 | class SummarizeAllFiles(BatchFlow):
 535 |     def prep(self, shared):
 536 |         # Return a list of param dicts (one per file)
 537 |         filenames = list(shared["data"].keys())  # e.g., ["file1.txt", "file2.txt", ...]
 538 |         return [{"filename": fn} for fn in filenames]
 539 | 
 540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce):
 541 | summarize_file = SummarizeFile(start=load_file)
 542 | 
 543 | # Wrap that flow into a BatchFlow:
 544 | summarize_all_files = SummarizeAllFiles(start=summarize_file)
 545 | summarize_all_files.run(shared)
 546 | ```
 547 | 
 548 | ### Under the Hood
 549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`.
 550 | 2. The **BatchFlow** loops through each dict. For each one:
 551 |    - It merges the dict with the BatchFlow’s own `params`.
 552 |    - It calls `flow.run(shared)` using the merged result.
 553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict.
 554 | 
 555 | ---
 556 | 
 557 | ## 3. Nested or Multi-Level Batches
 558 | 
 559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance:
 560 | - **Outer** batch: returns a list of diretory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...).
 561 | - **Inner** batch: returning a list of per-file param dicts.
 562 | 
 563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once.
 564 | 
 565 | ```python
 566 | 
 567 | class FileBatchFlow(BatchFlow):
 568 |     def prep(self, shared):
 569 |         directory = self.params["directory"]
 570 |         # e.g., files = ["file1.txt", "file2.txt", ...]
 571 |         files = [f for f in os.listdir(directory) if f.endswith(".txt")]
 572 |         return [{"filename": f} for f in files]
 573 | 
 574 | class DirectoryBatchFlow(BatchFlow):
 575 |     def prep(self, shared):
 576 |         directories = [ "/path/to/dirA", "/path/to/dirB"]
 577 |         return [{"directory": d} for d in directories]
 578 | 
 579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"}
 580 | inner_flow = FileBatchFlow(start=MapSummaries())
 581 | outer_flow = DirectoryBatchFlow(start=inner_flow)
 582 | ```
 583 | 
 584 | ================================================
 585 | File: docs/core_abstraction/communication.md
 586 | ================================================
 587 | ---
 588 | layout: default
 589 | title: "Communication"
 590 | parent: "Core Abstraction"
 591 | nav_order: 3
 592 | ---
 593 | 
 594 | # Communication
 595 | 
 596 | Nodes and Flows **communicate** in 2 ways:
 597 | 
 598 | 1. **Shared Store (for almost all the cases)** 
 599 | 
 600 |    - A global data structure (often an in-mem dict) that all nodes can read ( `prep()`) and write (`post()`).  
 601 |    - Great for data results, large content, or anything multiple nodes need.
 602 |    - You shall design the data structure and populate it ahead.
 603 |      
 604 |    - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*!  This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](./batch.md).
 605 |      {: .best-practice }
 606 | 
 607 | 2. **Params (only for [Batch](./batch.md))** 
 608 |    - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**.
 609 |    - Good for identifiers like filenames or numeric IDs, in Batch mode.
 610 | 
 611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller).
 612 | 
 613 | ---
 614 | 
 615 | ## 1. Shared Store
 616 | 
 617 | ### Overview
 618 | 
 619 | A shared store is typically an in-mem dictionary, like:
 620 | ```python
 621 | shared = {"data": {}, "summary": {}, "config": {...}, ...}
 622 | ```
 623 | 
 624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements.
 625 | 
 626 | ### Example
 627 | 
 628 | ```python
 629 | class LoadData(Node):
 630 |     def post(self, shared, prep_res, exec_res):
 631 |         # We write data to shared store
 632 |         shared["data"] = "Some text content"
 633 |         return None
 634 | 
 635 | class Summarize(Node):
 636 |     def prep(self, shared):
 637 |         # We read data from shared store
 638 |         return shared["data"]
 639 | 
 640 |     def exec(self, prep_res):
 641 |         # Call LLM to summarize
 642 |         prompt = f"Summarize: {prep_res}"
 643 |         summary = call_llm(prompt)
 644 |         return summary
 645 | 
 646 |     def post(self, shared, prep_res, exec_res):
 647 |         # We write summary to shared store
 648 |         shared["summary"] = exec_res
 649 |         return "default"
 650 | 
 651 | load_data = LoadData()
 652 | summarize = Summarize()
 653 | load_data >> summarize
 654 | flow = Flow(start=load_data)
 655 | 
 656 | shared = {}
 657 | flow.run(shared)
 658 | ```
 659 | 
 660 | Here:
 661 | - `LoadData` writes to `shared["data"]`.
 662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`.
 663 | 
 664 | ---
 665 | 
 666 | ## 2. Params
 667 | 
 668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are:
 669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`).
 670 | - **Set** via `set_params()`.
 671 | - **Cleared** and updated each time a parent Flow calls it.
 672 | 
 673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow. 
 674 | > 
 675 | > If you need to set child node params, see [Batch](./batch.md).
 676 | {: .warning }
 677 | 
 678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store.
 679 | 
 680 | ### Example
 681 | 
 682 | ```python
 683 | # 1) Create a Node that uses params
 684 | class SummarizeFile(Node):
 685 |     def prep(self, shared):
 686 |         # Access the node's param
 687 |         filename = self.params["filename"]
 688 |         return shared["data"].get(filename, "")
 689 | 
 690 |     def exec(self, prep_res):
 691 |         prompt = f"Summarize: {prep_res}"
 692 |         return call_llm(prompt)
 693 | 
 694 |     def post(self, shared, prep_res, exec_res):
 695 |         filename = self.params["filename"]
 696 |         shared["summary"][filename] = exec_res
 697 |         return "default"
 698 | 
 699 | # 2) Set params
 700 | node = SummarizeFile()
 701 | 
 702 | # 3) Set Node params directly (for testing)
 703 | node.set_params({"filename": "doc1.txt"})
 704 | node.run(shared)
 705 | 
 706 | # 4) Create Flow
 707 | flow = Flow(start=node)
 708 | 
 709 | # 5) Set Flow params (overwrites node params)
 710 | flow.set_params({"filename": "doc2.txt"})
 711 | flow.run(shared)  # The node summarizes doc2, not doc1
 712 | ```
 713 | 
 714 | ================================================
 715 | File: docs/core_abstraction/flow.md
 716 | ================================================
 717 | ---
 718 | layout: default
 719 | title: "Flow"
 720 | parent: "Core Abstraction"
 721 | nav_order: 2
 722 | ---
 723 | 
 724 | # Flow
 725 | 
 726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`.
 727 | 
 728 | ## 1. Action-based Transitions
 729 | 
 730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`.
 731 | 
 732 | You define transitions with the syntax:
 733 | 
 734 | 1. **Basic default transition**: `node_a >> node_b`
 735 |   This means if `node_a.post()` returns `"default"`, go to `node_b`. 
 736 |   (Equivalent to `node_a - "default" >> node_b`)
 737 | 
 738 | 2. **Named action transition**: `node_a - "action_name" >> node_b`
 739 |   This means if `node_a.post()` returns `"action_name"`, go to `node_b`.
 740 | 
 741 | It's possible to create loops, branching, or multi-step flows.
 742 | 
 743 | ## 2. Creating a Flow
 744 | 
 745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node.
 746 | 
 747 | ### Example: Simple Sequence
 748 | 
 749 | Here's a minimal flow of two nodes in a chain:
 750 | 
 751 | ```python
 752 | node_a >> node_b
 753 | flow = Flow(start=node_a)
 754 | flow.run(shared)
 755 | ```
 756 | 
 757 | - When you run the flow, it executes `node_a`.  
 758 | - Suppose `node_a.post()` returns `"default"`.  
 759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`.  
 760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there.
 761 | 
 762 | ### Example: Branching & Looping
 763 | 
 764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions:
 765 | 
 766 | - `"approved"`: expense is approved, move to payment processing
 767 | - `"needs_revision"`: expense needs changes, send back for revision 
 768 | - `"rejected"`: expense is denied, finish the process
 769 | 
 770 | We can wire them like this:
 771 | 
 772 | ```python
 773 | # Define the flow connections
 774 | review - "approved" >> payment        # If approved, process payment
 775 | review - "needs_revision" >> revise   # If needs changes, go to revision
 776 | review - "rejected" >> finish         # If rejected, finish the process
 777 | 
 778 | revise >> review   # After revision, go back for another review
 779 | payment >> finish  # After payment, finish the process
 780 | 
 781 | flow = Flow(start=review)
 782 | ```
 783 | 
 784 | Let's see how it flows:
 785 | 
 786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node
 787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review`
 788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops
 789 | 
 790 | ```mermaid
 791 | flowchart TD
 792 |     review[Review Expense] -->|approved| payment[Process Payment]
 793 |     review -->|needs_revision| revise[Revise Report]
 794 |     review -->|rejected| finish[Finish Process]
 795 | 
 796 |     revise --> review
 797 |     payment --> finish
 798 | ```
 799 | 
 800 | ### Running Individual Nodes vs. Running a Flow
 801 | 
 802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action. 
 803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue.
 804 | 
 805 | > `node.run(shared)` **does not** proceed to the successor.
 806 | > This is mainly for debugging or testing a single node.
 807 | > 
 808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly.
 809 | {: .warning }
 810 | 
 811 | ## 3. Nested Flows
 812 | 
 813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can:
 814 | 
 815 | 1. Use a Flow as a Node within another Flow's transitions.  
 816 | 2. Combine multiple smaller Flows into a larger Flow for reuse.  
 817 | 3. Node `params` will be a merging of **all** parents' `params`.
 818 | 
 819 | ### Flow's Node Methods
 820 | 
 821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However:
 822 | 
 823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes.
 824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store.
 825 | 
 826 | ### Basic Flow Nesting
 827 | 
 828 | Here's how to connect a flow to another node:
 829 | 
 830 | ```python
 831 | # Create a sub-flow
 832 | node_a >> node_b
 833 | subflow = Flow(start=node_a)
 834 | 
 835 | # Connect it to another node
 836 | subflow >> node_c
 837 | 
 838 | # Create the parent flow
 839 | parent_flow = Flow(start=subflow)
 840 | ```
 841 | 
 842 | When `parent_flow.run()` executes:
 843 | 1. It starts `subflow`
 844 | 2. `subflow` runs through its nodes (`node_a->node_b`)
 845 | 3. After `subflow` completes, execution continues to `node_c`
 846 | 
 847 | ### Example: Order Processing Pipeline
 848 | 
 849 | Here's a practical example that breaks down order processing into nested flows:
 850 | 
 851 | ```python
 852 | # Payment processing sub-flow
 853 | validate_payment >> process_payment >> payment_confirmation
 854 | payment_flow = Flow(start=validate_payment)
 855 | 
 856 | # Inventory sub-flow
 857 | check_stock >> reserve_items >> update_inventory
 858 | inventory_flow = Flow(start=check_stock)
 859 | 
 860 | # Shipping sub-flow
 861 | create_label >> assign_carrier >> schedule_pickup
 862 | shipping_flow = Flow(start=create_label)
 863 | 
 864 | # Connect the flows into a main order pipeline
 865 | payment_flow >> inventory_flow >> shipping_flow
 866 | 
 867 | # Create the master flow
 868 | order_pipeline = Flow(start=payment_flow)
 869 | 
 870 | # Run the entire pipeline
 871 | order_pipeline.run(shared_data)
 872 | ```
 873 | 
 874 | This creates a clean separation of concerns while maintaining a clear execution path:
 875 | 
 876 | ```mermaid
 877 | flowchart LR
 878 |     subgraph order_pipeline[Order Pipeline]
 879 |         subgraph paymentFlow["Payment Flow"]
 880 |             A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation]
 881 |         end
 882 | 
 883 |         subgraph inventoryFlow["Inventory Flow"]
 884 |             D[Check Stock] --> E[Reserve Items] --> F[Update Inventory]
 885 |         end
 886 | 
 887 |         subgraph shippingFlow["Shipping Flow"]
 888 |             G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup]
 889 |         end
 890 | 
 891 |         paymentFlow --> inventoryFlow
 892 |         inventoryFlow --> shippingFlow
 893 |     end
 894 | ```
 895 | 
 896 | ================================================
 897 | File: docs/core_abstraction/node.md
 898 | ================================================
 899 | ---
 900 | layout: default
 901 | title: "Node"
 902 | parent: "Core Abstraction"
 903 | nav_order: 1
 904 | ---
 905 | 
 906 | # Node
 907 | 
 908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`:
 909 | 
 910 | <div align="center">
 911 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/node.png?raw=true" width="400"/>
 912 | </div>
 913 | 
 914 | 1. `prep(shared)`
 915 |    - **Read and preprocess data** from `shared` store. 
 916 |    - Examples: *query DB, read files, or serialize data into a string*.
 917 |    - Return `prep_res`, which is used by `exec()` and `post()`.
 918 | 
 919 | 2. `exec(prep_res)`
 920 |    - **Execute compute logic**, with optional retries and error handling (below).
 921 |    - Examples: *(mostly) LLM calls, remote APIs, tool use*.
 922 |    - ⚠️ This shall be only for compute and **NOT** access `shared`.
 923 |    - ⚠️ If retries enabled, ensure idempotent implementation.
 924 |    - ⚠️ Defer exception handling to the Node's built-in retry mechanism.
 925 |    - Return `exec_res`, which is passed to `post()`.
 926 | 
 927 | 3. `post(shared, prep_res, exec_res)`
 928 |    - **Postprocess and write data** back to `shared`.
 929 |    - Examples: *update DB, change states, log results*.
 930 |    - **Decide the next action** by returning a *string* (`action = "default"` if *None*).
 931 | 
 932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately.
 933 | >
 934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data.
 935 | {: .note }
 936 | 
 937 | ### Fault Tolerance & Retries
 938 | 
 939 | You can **retry** `exec()` if it raises an exception via two parameters when define the Node:
 940 | 
 941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry).
 942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting). 
 943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off.
 944 | 
 945 | ```python 
 946 | my_node = SummarizeFile(max_retries=3, wait=10)
 947 | ```
 948 | 
 949 | When an exception occurs in `exec()`, the Node automatically retries until:
 950 | 
 951 | - It either succeeds, or
 952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt.
 953 | 
 954 | You can get the current retry times (0-based) from `self.cur_retry`.
 955 | 
 956 | ```python 
 957 | class RetryNode(Node):
 958 |     def exec(self, prep_res):
 959 |         print(f"Retry {self.cur_retry} times")
 960 |         raise Exception("Failed")
 961 | ```
 962 | 
 963 | ### Graceful Fallback
 964 | 
 965 | To **gracefully handle** the exception (after all retries) rather than raising it, override:
 966 | 
 967 | ```python 
 968 | def exec_fallback(self, prep_res, exc):
 969 |     raise exc
 970 | ```
 971 | 
 972 | By default, it just re-raises exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`.
 973 | 
 974 | ### Example: Summarize file
 975 | 
 976 | ```python 
 977 | class SummarizeFile(Node):
 978 |     def prep(self, shared):
 979 |         return shared["data"]
 980 | 
 981 |     def exec(self, prep_res):
 982 |         if not prep_res:
 983 |             return "Empty file content"
 984 |         prompt = f"Summarize this text in 10 words: {prep_res}"
 985 |         summary = call_llm(prompt)  # might fail
 986 |         return summary
 987 | 
 988 |     def exec_fallback(self, prep_res, exc):
 989 |         # Provide a simple fallback instead of crashing
 990 |         return "There was an error processing your request."
 991 | 
 992 |     def post(self, shared, prep_res, exec_res):
 993 |         shared["summary"] = exec_res
 994 |         # Return "default" by not returning
 995 | 
 996 | summarize_node = SummarizeFile(max_retries=3)
 997 | 
 998 | # node.run() calls prep->exec->post
 999 | # If exec() fails, it retries up to 3 times before calling exec_fallback()
1000 | action_result = summarize_node.run(shared)
1001 | 
1002 | print("Action returned:", action_result)  # "default"
1003 | print("Summary stored:", shared["summary"])
1004 | ```
1005 | 
1006 | ================================================
1007 | File: docs/core_abstraction/parallel.md
1008 | ================================================
1009 | ---
1010 | layout: default
1011 | title: "(Advanced) Parallel"
1012 | parent: "Core Abstraction"
1013 | nav_order: 6
1014 | ---
1015 | 
1016 | # (Advanced) Parallel
1017 | 
1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows  **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute. 
1019 | 
1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O.
1021 | {: .warning }
1022 | 
1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize.
1024 | > 
1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals).
1026 | > 
1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits.
1028 | {: .best-practice }
1029 | 
1030 | ## AsyncParallelBatchNode
1031 | 
1032 | Like **AsyncBatchNode**, but run `exec_async()` in **parallel**:
1033 | 
1034 | ```python
1035 | class ParallelSummaries(AsyncParallelBatchNode):
1036 |     async def prep_async(self, shared):
1037 |         # e.g., multiple texts
1038 |         return shared["texts"]
1039 | 
1040 |     async def exec_async(self, text):
1041 |         prompt = f"Summarize: {text}"
1042 |         return await call_llm_async(prompt)
1043 | 
1044 |     async def post_async(self, shared, prep_res, exec_res_list):
1045 |         shared["summary"] = "\n\n".join(exec_res_list)
1046 |         return "default"
1047 | 
1048 | node = ParallelSummaries()
1049 | flow = AsyncFlow(start=node)
1050 | ```
1051 | 
1052 | ## AsyncParallelBatchFlow
1053 | 
1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters:
1055 | 
1056 | ```python
1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow):
1058 |     async def prep_async(self, shared):
1059 |         return [{"filename": f} for f in shared["files"]]
1060 | 
1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile())
1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow)
1063 | await parallel_flow.run_async(shared)
1064 | ```
1065 | 
1066 | ================================================
1067 | File: docs/design_pattern/agent.md
1068 | ================================================
1069 | ---
1070 | layout: default
1071 | title: "Agent"
1072 | parent: "Design Pattern"
1073 | nav_order: 1
1074 | ---
1075 | 
1076 | # Agent
1077 | 
1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context.
1079 | 
1080 | <div align="center">
1081 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/agent.png?raw=true" width="350"/>
1082 | </div>
1083 | 
1084 | ## Implement Agent with Graph
1085 | 
1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions.  
1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
1088 | 3. **Agent Node:** Provide a prompt to decide action—for example:
1089 | 
1090 | ```python
1091 | f"""
1092 | ### CONTEXT
1093 | Task: {task_description}
1094 | Previous Actions: {previous_actions}
1095 | Current State: {current_state}
1096 | 
1097 | ### ACTION SPACE
1098 | [1] search
1099 |   Description: Use web search to get results
1100 |   Parameters:
1101 |     - query (str): What to search for
1102 | 
1103 | [2] answer
1104 |   Description: Conclude based on the results
1105 |   Parameters:
1106 |     - result (str): Final answer to provide
1107 | 
1108 | ### NEXT ACTION
1109 | Decide the next action based on the current context and available action space.
1110 | Return your response in the following format:
1111 | 
1112 | ```yaml
1113 | thinking: |
1114 |     <your step-by-step reasoning process>
1115 | action: <action_name>
1116 | parameters:
1117 |     <parameter_name>: <parameter_value>
1118 | ```"""
1119 | ```
1120 | 
1121 | The core of building **high-performance** and **reliable** agents boils down to:
1122 | 
1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content.
1124 | 
1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or  `read_csvs`. Instead, import CSVs into the database.
1126 | 
1127 | ## Example Good Action Design
1128 | 
1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once.
1130 | 
1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts).
1132 | 
1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files.
1134 | 
1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends.
1136 | 
1137 | ## Example: Search Agent
1138 | 
1139 | This agent:
1140 | 1. Decides whether to search or answer
1141 | 2. If searches, loops back to decide if more search needed
1142 | 3. Answers when enough context gathered
1143 | 
1144 | ```python
1145 | class DecideAction(Node):
1146 |     def prep(self, shared):
1147 |         context = shared.get("context", "No previous search")
1148 |         query = shared["query"]
1149 |         return query, context
1150 |         
1151 |     def exec(self, inputs):
1152 |         query, context = inputs
1153 |         prompt = f"""
1154 | Given input: {query}
1155 | Previous search results: {context}
1156 | Should I: 1) Search web for more info 2) Answer with current knowledge
1157 | Output in yaml:
1158 | ```yaml
1159 | action: search/answer
1160 | reason: why this action
1161 | search_term: search phrase if action is search
1162 | ```"""
1163 |         resp = call_llm(prompt)
1164 |         yaml_str = resp.split("```yaml")[1].split("```")[0].strip()
1165 |         result = yaml.safe_load(yaml_str)
1166 |         
1167 |         assert isinstance(result, dict)
1168 |         assert "action" in result
1169 |         assert "reason" in result
1170 |         assert result["action"] in ["search", "answer"]
1171 |         if result["action"] == "search":
1172 |             assert "search_term" in result
1173 |         
1174 |         return result
1175 | 
1176 |     def post(self, shared, prep_res, exec_res):
1177 |         if exec_res["action"] == "search":
1178 |             shared["search_term"] = exec_res["search_term"]
1179 |         return exec_res["action"]
1180 | 
1181 | class SearchWeb(Node):
1182 |     def prep(self, shared):
1183 |         return shared["search_term"]
1184 |         
1185 |     def exec(self, search_term):
1186 |         return search_web(search_term)
1187 |     
1188 |     def post(self, shared, prep_res, exec_res):
1189 |         prev_searches = shared.get("context", [])
1190 |         shared["context"] = prev_searches + [
1191 |             {"term": shared["search_term"], "result": exec_res}
1192 |         ]
1193 |         return "decide"
1194 |         
1195 | class DirectAnswer(Node):
1196 |     def prep(self, shared):
1197 |         return shared["query"], shared.get("context", "")
1198 |         
1199 |     def exec(self, inputs):
1200 |         query, context = inputs
1201 |         return call_llm(f"Context: {context}\nAnswer: {query}")
1202 | 
1203 |     def post(self, shared, prep_res, exec_res):
1204 |        print(f"Answer: {exec_res}")
1205 |        shared["answer"] = exec_res
1206 | 
1207 | # Connect nodes
1208 | decide = DecideAction()
1209 | search = SearchWeb()
1210 | answer = DirectAnswer()
1211 | 
1212 | decide - "search" >> search
1213 | decide - "answer" >> answer
1214 | search - "decide" >> decide  # Loop back
1215 | 
1216 | flow = Flow(start=decide)
1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"})
1218 | ```
1219 | 
1220 | ================================================
1221 | File: docs/design_pattern/mapreduce.md
1222 | ================================================
1223 | ---
1224 | layout: default
1225 | title: "Map Reduce"
1226 | parent: "Design Pattern"
1227 | nav_order: 4
1228 | ---
1229 | 
1230 | # Map Reduce
1231 | 
1232 | MapReduce is a design pattern suitable when you have either:
1233 | - Large input data (e.g., multiple files to process), or
1234 | - Large output data (e.g., multiple forms to fill)
1235 | 
1236 | and there is a logical way to break the task into smaller, ideally independent parts. 
1237 | 
1238 | <div align="center">
1239 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/mapreduce.png?raw=true" width="400"/>
1240 | </div>
1241 | 
1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
1243 | 
1244 | ### Example: Document Summarization
1245 | 
1246 | ```python
1247 | class SummarizeAllFiles(BatchNode):
1248 |     def prep(self, shared):
1249 |         files_dict = shared["files"]  # e.g. 10 files
1250 |         return list(files_dict.items())  # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...]
1251 | 
1252 |     def exec(self, one_file):
1253 |         filename, file_content = one_file
1254 |         summary_text = call_llm(f"Summarize the following file:\n{file_content}")
1255 |         return (filename, summary_text)
1256 | 
1257 |     def post(self, shared, prep_res, exec_res_list):
1258 |         shared["file_summaries"] = dict(exec_res_list)
1259 | 
1260 | class CombineSummaries(Node):
1261 |     def prep(self, shared):
1262 |         return shared["file_summaries"]
1263 | 
1264 |     def exec(self, file_summaries):
1265 |         # format as: "File1: summary\nFile2: summary...\n"
1266 |         text_list = []
1267 |         for fname, summ in file_summaries.items():
1268 |             text_list.append(f"{fname} summary:\n{summ}\n")
1269 |         big_text = "\n---\n".join(text_list)
1270 | 
1271 |         return call_llm(f"Combine these file summaries into one final summary:\n{big_text}")
1272 | 
1273 |     def post(self, shared, prep_res, final_summary):
1274 |         shared["all_files_summary"] = final_summary
1275 | 
1276 | batch_node = SummarizeAllFiles()
1277 | combine_node = CombineSummaries()
1278 | batch_node >> combine_node
1279 | 
1280 | flow = Flow(start=batch_node)
1281 | 
1282 | shared = {
1283 |     "files": {
1284 |         "file1.txt": "Alice was beginning to get very tired of sitting by her sister...",
1285 |         "file2.txt": "Some other interesting text ...",
1286 |         # ...
1287 |     }
1288 | }
1289 | flow.run(shared)
1290 | print("Individual Summaries:", shared["file_summaries"])
1291 | print("\nFinal Summary:\n", shared["all_files_summary"])
1292 | ```
1293 | 
1294 | ================================================
1295 | File: docs/design_pattern/rag.md
1296 | ================================================
1297 | ---
1298 | layout: default
1299 | title: "RAG"
1300 | parent: "Design Pattern"
1301 | nav_order: 3
1302 | ---
1303 | 
1304 | # RAG (Retrieval Augmented Generation)
1305 | 
1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline:
1307 | 
1308 | <div align="center">
1309 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/rag.png?raw=true" width="400"/>
1310 | </div>
1311 | 
1312 | 1. **Offline stage**: Preprocess and index documents ("building the index").
1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context.
1314 | 
1315 | ---
1316 | ## Stage 1: Offline Indexing
1317 | 
1318 | We create three Nodes:
1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text.
1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk.
1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md).
1322 | 
1323 | ```python
1324 | class ChunkDocs(BatchNode):
1325 |     def prep(self, shared):
1326 |         # A list of file paths in shared["files"]. We process each file.
1327 |         return shared["files"]
1328 | 
1329 |     def exec(self, filepath):
1330 |         # read file content. In real usage, do error handling.
1331 |         with open(filepath, "r", encoding="utf-8") as f:
1332 |             text = f.read()
1333 |         # chunk by 100 chars each
1334 |         chunks = []
1335 |         size = 100
1336 |         for i in range(0, len(text), size):
1337 |             chunks.append(text[i : i + size])
1338 |         return chunks
1339 |     
1340 |     def post(self, shared, prep_res, exec_res_list):
1341 |         # exec_res_list is a list of chunk-lists, one per file.
1342 |         # flatten them all into a single list of chunks.
1343 |         all_chunks = []
1344 |         for chunk_list in exec_res_list:
1345 |             all_chunks.extend(chunk_list)
1346 |         shared["all_chunks"] = all_chunks
1347 | 
1348 | class EmbedDocs(BatchNode):
1349 |     def prep(self, shared):
1350 |         return shared["all_chunks"]
1351 | 
1352 |     def exec(self, chunk):
1353 |         return get_embedding(chunk)
1354 | 
1355 |     def post(self, shared, prep_res, exec_res_list):
1356 |         # Store the list of embeddings.
1357 |         shared["all_embeds"] = exec_res_list
1358 |         print(f"Total embeddings: {len(exec_res_list)}")
1359 | 
1360 | class StoreIndex(Node):
1361 |     def prep(self, shared):
1362 |         # We'll read all embeds from shared.
1363 |         return shared["all_embeds"]
1364 | 
1365 |     def exec(self, all_embeds):
1366 |         # Create a vector index (faiss or other DB in real usage).
1367 |         index = create_index(all_embeds)
1368 |         return index
1369 | 
1370 |     def post(self, shared, prep_res, index):
1371 |         shared["index"] = index
1372 | 
1373 | # Wire them in sequence
1374 | chunk_node = ChunkDocs()
1375 | embed_node = EmbedDocs()
1376 | store_node = StoreIndex()
1377 | 
1378 | chunk_node >> embed_node >> store_node
1379 | 
1380 | OfflineFlow = Flow(start=chunk_node)
1381 | ```
1382 | 
1383 | Usage example:
1384 | 
1385 | ```python
1386 | shared = {
1387 |     "files": ["doc1.txt", "doc2.txt"],  # any text files
1388 | }
1389 | OfflineFlow.run(shared)
1390 | ```
1391 | 
1392 | ---
1393 | ## Stage 2: Online Query & Answer
1394 | 
1395 | We have 3 nodes:
1396 | 1. `EmbedQuery` – embeds the user’s question.
1397 | 2. `RetrieveDocs` – retrieves top chunk from the index.
1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer.
1399 | 
1400 | ```python
1401 | class EmbedQuery(Node):
1402 |     def prep(self, shared):
1403 |         return shared["question"]
1404 | 
1405 |     def exec(self, question):
1406 |         return get_embedding(question)
1407 | 
1408 |     def post(self, shared, prep_res, q_emb):
1409 |         shared["q_emb"] = q_emb
1410 | 
1411 | class RetrieveDocs(Node):
1412 |     def prep(self, shared):
1413 |         # We'll need the query embedding, plus the offline index/chunks
1414 |         return shared["q_emb"], shared["index"], shared["all_chunks"]
1415 | 
1416 |     def exec(self, inputs):
1417 |         q_emb, index, chunks = inputs
1418 |         I, D = search_index(index, q_emb, top_k=1)
1419 |         best_id = I[0][0]
1420 |         relevant_chunk = chunks[best_id]
1421 |         return relevant_chunk
1422 | 
1423 |     def post(self, shared, prep_res, relevant_chunk):
1424 |         shared["retrieved_chunk"] = relevant_chunk
1425 |         print("Retrieved chunk:", relevant_chunk[:60], "...")
1426 | 
1427 | class GenerateAnswer(Node):
1428 |     def prep(self, shared):
1429 |         return shared["question"], shared["retrieved_chunk"]
1430 | 
1431 |     def exec(self, inputs):
1432 |         question, chunk = inputs
1433 |         prompt = f"Question: {question}\nContext: {chunk}\nAnswer:"
1434 |         return call_llm(prompt)
1435 | 
1436 |     def post(self, shared, prep_res, answer):
1437 |         shared["answer"] = answer
1438 |         print("Answer:", answer)
1439 | 
1440 | embed_qnode = EmbedQuery()
1441 | retrieve_node = RetrieveDocs()
1442 | generate_node = GenerateAnswer()
1443 | 
1444 | embed_qnode >> retrieve_node >> generate_node
1445 | OnlineFlow = Flow(start=embed_qnode)
1446 | ```
1447 | 
1448 | Usage example:
1449 | 
1450 | ```python
1451 | # Suppose we already ran OfflineFlow and have:
1452 | # shared["all_chunks"], shared["index"], etc.
1453 | shared["question"] = "Why do people like cats?"
1454 | 
1455 | OnlineFlow.run(shared)
1456 | # final answer in shared["answer"]
1457 | ```
1458 | 
1459 | ================================================
1460 | File: docs/design_pattern/structure.md
1461 | ================================================
1462 | ---
1463 | layout: default
1464 | title: "Structured Output"
1465 | parent: "Design Pattern"
1466 | nav_order: 5
1467 | ---
1468 | 
1469 | # Structured Output
1470 | 
1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys.
1472 | 
1473 | There are several approaches to achieve a structured output:
1474 | - **Prompting** the LLM to strictly return a defined structure.
1475 | - Using LLMs that natively support **schema enforcement**.
1476 | - **Post-processing** the LLM's response to extract structured content.
1477 | 
1478 | In practice, **Prompting** is simple and reliable for modern LLMs.
1479 | 
1480 | ### Example Use Cases
1481 | 
1482 | - Extracting Key Information 
1483 | 
1484 | ```yaml
1485 | product:
1486 |   name: Widget Pro
1487 |   price: 199.99
1488 |   description: |
1489 |     A high-quality widget designed for professionals.
1490 |     Recommended for advanced users.
1491 | ```
1492 | 
1493 | - Summarizing Documents into Bullet Points
1494 | 
1495 | ```yaml
1496 | summary:
1497 |   - This product is easy to use.
1498 |   - It is cost-effective.
1499 |   - Suitable for all skill levels.
1500 | ```
1501 | 
1502 | - Generating Configuration Files
1503 | 
1504 | ```yaml
1505 | server:
1506 |   host: 127.0.0.1
1507 |   port: 8080
1508 |   ssl: true
1509 | ```
1510 | 
1511 | ## Prompt Engineering
1512 | 
1513 | When prompting the LLM to produce **structured** output:
1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`).
1515 | 2. **Validate** that all required fields exist (and let `Node` handles retry).
1516 | 
1517 | ### Example Text Summarization
1518 | 
1519 | ```python
1520 | class SummarizeNode(Node):
1521 |     def exec(self, prep_res):
1522 |         # Suppose `prep_res` is the text to summarize.
1523 |         prompt = f"""
1524 | Please summarize the following text as YAML, with exactly 3 bullet points
1525 | 
1526 | {prep_res}
1527 | 
1528 | Now, output:
1529 | ```yaml
1530 | summary:
1531 |   - bullet 1
1532 |   - bullet 2
1533 |   - bullet 3
1534 | ```"""
1535 |         response = call_llm(prompt)
1536 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
1537 | 
1538 |         import yaml
1539 |         structured_result = yaml.safe_load(yaml_str)
1540 | 
1541 |         assert "summary" in structured_result
1542 |         assert isinstance(structured_result["summary"], list)
1543 | 
1544 |         return structured_result
1545 | ```
1546 | 
1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic)
1548 | {: .note }
1549 | 
1550 | ### Why YAML instead of JSON?
1551 | 
1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes.
1553 | 
1554 | **In JSON**  
1555 | 
1556 | ```json
1557 | {
1558 |   "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\""
1559 | }
1560 | ```
1561 | 
1562 | - Every double quote inside the string must be escaped with `\"`.
1563 | - Each newline in the dialogue must be represented as `\n`.
1564 | 
1565 | **In YAML**  
1566 | 
1567 | ```yaml
1568 | dialogue: |
1569 |   Alice said: "Hello Bob.
1570 |   How are you?
1571 |   I am good."
1572 | ```
1573 | 
1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`).
1575 | - Newlines are naturally preserved without needing `\n`.
1576 | 
1577 | ================================================
1578 | File: docs/design_pattern/workflow.md
1579 | ================================================
1580 | ---
1581 | layout: default
1582 | title: "Workflow"
1583 | parent: "Design Pattern"
1584 | nav_order: 2
1585 | ---
1586 | 
1587 | # Workflow
1588 | 
1589 | Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes.
1590 | 
1591 | <div align="center">
1592 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/workflow.png?raw=true" width="400"/>
1593 | </div>
1594 | 
1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*.
1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*.
1597 | > 
1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md).
1599 | {: .best-practice }
1600 | 
1601 | ### Example: Article Writing
1602 | 
1603 | ```python
1604 | class GenerateOutline(Node):
1605 |     def prep(self, shared): return shared["topic"]
1606 |     def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}")
1607 |     def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res
1608 | 
1609 | class WriteSection(Node):
1610 |     def prep(self, shared): return shared["outline"]
1611 |     def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}")
1612 |     def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res
1613 | 
1614 | class ReviewAndRefine(Node):
1615 |     def prep(self, shared): return shared["draft"]
1616 |     def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}")
1617 |     def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res
1618 | 
1619 | # Connect nodes
1620 | outline = GenerateOutline()
1621 | write = WriteSection()
1622 | review = ReviewAndRefine()
1623 | 
1624 | outline >> write >> review
1625 | 
1626 | # Create and run flow
1627 | writing_flow = Flow(start=outline)
1628 | shared = {"topic": "AI Safety"}
1629 | writing_flow.run(shared)
1630 | ```
1631 | 
1632 | For *dynamic cases*, consider using [Agents](./agent.md).
1633 | 
1634 | ================================================
1635 | File: docs/utility_function/llm.md
1636 | ================================================
1637 | ---
1638 | layout: default
1639 | title: "LLM Wrapper"
1640 | parent: "Utility Function"
1641 | nav_order: 1
1642 | ---
1643 | 
1644 | # LLM Wrappers
1645 | 
1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm). 
1647 | Here, we provide some minimal example implementations:
1648 | 
1649 | 1. OpenAI
1650 |     ```python
1651 |     def call_llm(prompt):
1652 |         from openai import OpenAI
1653 |         client = OpenAI(api_key="YOUR_API_KEY_HERE")
1654 |         r = client.chat.completions.create(
1655 |             model="gpt-4o",
1656 |             messages=[{"role": "user", "content": prompt}]
1657 |         )
1658 |         return r.choices[0].message.content
1659 | 
1660 |     # Example usage
1661 |     call_llm("How are you?")
1662 |     ```
1663 |     > Store the API key in an environment variable like OPENAI_API_KEY for security.
1664 |     {: .best-practice }
1665 | 
1666 | 2. Claude (Anthropic)
1667 |     ```python
1668 |     def call_llm(prompt):
1669 |         from anthropic import Anthropic
1670 |         client = Anthropic(api_key="YOUR_API_KEY_HERE")
1671 |         r = client.messages.create(
1672 |             model="claude-sonnet-4-0",
1673 |             messages=[
1674 |                 {"role": "user", "content": prompt}
1675 |             ]
1676 |         )
1677 |         return r.content[0].text
1678 |     ```
1679 | 
1680 | 3. Google (Generative AI Studio / PaLM API)
1681 |     ```python
1682 |     def call_llm(prompt):
1683 |     from google import genai
1684 |     client = genai.Client(api_key='GEMINI_API_KEY')
1685 |         response = client.models.generate_content(
1686 |         model='gemini-2.5-pro',
1687 |         contents=prompt
1688 |     )
1689 |     return response.text
1690 |     ```
1691 | 
1692 | 4. Azure (Azure OpenAI)
1693 |     ```python
1694 |     def call_llm(prompt):
1695 |         from openai import AzureOpenAI
1696 |         client = AzureOpenAI(
1697 |             azure_endpoint="https://<YOUR_RESOURCE_NAME>.openai.azure.com/",
1698 |             api_key="YOUR_API_KEY_HERE",
1699 |             api_version="2023-05-15"
1700 |         )
1701 |         r = client.chat.completions.create(
1702 |             model="<YOUR_DEPLOYMENT_NAME>",
1703 |             messages=[{"role": "user", "content": prompt}]
1704 |         )
1705 |         return r.choices[0].message.content
1706 |     ```
1707 | 
1708 | 5. Ollama (Local LLM)
1709 |     ```python
1710 |     def call_llm(prompt):
1711 |         from ollama import chat
1712 |         response = chat(
1713 |             model="llama2",
1714 |             messages=[{"role": "user", "content": prompt}]
1715 |         )
1716 |         return response.message.content
1717 |     ```
1718 | 
1719 | ## Improvements
1720 | Feel free to enhance your `call_llm` function as needed. Here are examples:
1721 | 
1722 | - Handle chat history:
1723 | 
1724 | ```python
1725 | def call_llm(messages):
1726 |     from openai import OpenAI
1727 |     client = OpenAI(api_key="YOUR_API_KEY_HERE")
1728 |     r = client.chat.completions.create(
1729 |         model="gpt-4o",
1730 |         messages=messages
1731 |     )
1732 |     return r.choices[0].message.content
1733 | ```
1734 | 
1735 | - Add in-memory caching 
1736 | 
1737 | ```python
1738 | from functools import lru_cache
1739 | 
1740 | @lru_cache(maxsize=1000)
1741 | def call_llm(prompt):
1742 |     # Your implementation here
1743 |     pass
1744 | ```
1745 | 
1746 | > ⚠️ Caching conflicts with Node retries, as retries yield the same result.
1747 | >
1748 | > To address this, you could use cached results only if not retried.
1749 | {: .warning }
1750 | 
1751 | 
1752 | ```python
1753 | from functools import lru_cache
1754 | 
1755 | @lru_cache(maxsize=1000)
1756 | def cached_call(prompt):
1757 |     pass
1758 | 
1759 | def call_llm(prompt, use_cache):
1760 |     if use_cache:
1761 |         return cached_call(prompt)
1762 |     # Call the underlying function directly
1763 |     return cached_call.__wrapped__(prompt)
1764 | 
1765 | class SummarizeNode(Node):
1766 |     def exec(self, text):
1767 |         return call_llm(f"Summarize: {text}", self.cur_retry==0)
1768 | ```
1769 | 
1770 | - Enable logging:
1771 | 
1772 | ```python
1773 | def call_llm(prompt):
1774 |     import logging
1775 |     logging.info(f"Prompt: {prompt}")
1776 |     response = ... # Your implementation here
1777 |     logging.info(f"Response: {response}")
1778 |     return response
1779 | ```


--------------------------------------------------------------------------------
/.cursorrules:
--------------------------------------------------------------------------------
   1 | ---
   2 | layout: default
   3 | title: "Agentic Coding"
   4 | ---
   5 | 
   6 | # Agentic Coding: Humans Design, Agents code!
   7 | 
   8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification.
   9 | {: .warning }
  10 | 
  11 | ## Agentic Coding Steps
  12 | 
  13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation:
  14 | 
  15 | | Steps                  | Human      | AI        | Comment                                                                 |
  16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------|
  17 | | 1. Requirements | ★★★ High  | ★☆☆ Low   | Humans understand the requirements and context.                    |
  18 | | 2. Flow          | ★★☆ Medium | ★★☆ Medium |  Humans specify the high-level design, and the AI fills in the details. |
  19 | | 3. Utilities   | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. |
  20 | | 4. Data          | ★☆☆ Low    | ★★★ High   | AI designs the data schema, and humans verify.                            |
  21 | | 5. Node          | ★☆☆ Low   | ★★★ High  | The AI helps design the node based on the flow.          |
  22 | | 6. Implementation      | ★☆☆ Low   | ★★★ High  |  The AI implements the flow based on the design. |
  23 | | 7. Optimization        | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. |
  24 | | 8. Reliability         | ★☆☆ Low   | ★★★ High  |  The AI writes test cases and addresses corner cases.     |
  25 | 
  26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit. 
  27 |     - Understand AI systems' strengths and limitations:
  28 |       - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails)
  29 |       - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL)
  30 |       - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning)
  31 |     - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features.
  32 |     - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early.
  33 | 
  34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes.
  35 |     - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)).
  36 |       - For each node in the flow, start with a high-level one-line description of what it does.
  37 |       - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine).
  38 |       - If using **Agent**, specify what are the inputs (context) and what are the possible actions.
  39 |       - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows.
  40 |     - Outline the flow and draw it in a mermaid diagram. For example:
  41 |       ```mermaid
  42 |       flowchart LR
  43 |           start[Start] --> batch[Batch]
  44 |           batch --> check[Check]
  45 |           check -->|OK| process
  46 |           check -->|Error| fix[Fix]
  47 |           fix --> check
  48 |           
  49 |           subgraph process[Process]
  50 |             step1[Step 1] --> step2[Step 2]
  51 |           end
  52 |           
  53 |           process --> endNode[End]
  54 |       ```
  55 |     - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition.  
  56 |       {: .best-practice }
  57 | 
  58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions.
  59 |     - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world:
  60 |         <div align="center"><img src="https://github.com/the-pocket/.github/raw/main/assets/utility.png?raw=true" width="400"/></div>
  61 | 
  62 |         - Reading inputs (e.g., retrieving Slack messages, reading emails)
  63 |         - Writing outputs (e.g., generating reports, sending emails)
  64 |         - Using external tools (e.g., calling LLMs, searching the web)
  65 |         - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system.
  66 |     - For each utility function, implement it and write a simple test.
  67 |     - Document their input/output, as well as why they are necessary. For example:
  68 |       - `name`: `get_embedding` (`utils/get_embedding.py`)
  69 |       - `input`: `str`
  70 |       - `output`: a vector of 3072 floats
  71 |       - `necessity`: Used by the second node to embed text
  72 |     - Example utility implementation:
  73 |       ```python
  74 |       # utils/call_llm.py
  75 |       from openai import OpenAI
  76 | 
  77 |       def call_llm(prompt):    
  78 |           client = OpenAI(api_key="YOUR_API_KEY_HERE")
  79 |           r = client.chat.completions.create(
  80 |               model="gpt-4o",
  81 |               messages=[{"role": "user", "content": prompt}]
  82 |           )
  83 |           return r.choices[0].message.content
  84 |           
  85 |       if __name__ == "__main__":
  86 |           prompt = "What is the meaning of life?"
  87 |           print(call_llm(prompt))
  88 |       ```
  89 |     - > **Sometimes, design Utilities before Flow:**  For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them.
  90 |       {: .best-practice }
  91 |     - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures.
  92 |       {: .warning }
  93 | 
  94 | 4. **Data Design**: Design the shared store that nodes will use to communicate.
  95 |    - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data.
  96 |       - For simple systems, use an in-memory dictionary.
  97 |       - For more complex systems or when persistence is required, use a database.
  98 |       - **Don't Repeat Yourself**: Use in-memory references or foreign keys.
  99 |       - Example shared store design:
 100 |         ```python
 101 |         shared = {
 102 |             "user": {
 103 |                 "id": "user123",
 104 |                 "context": {                # Another nested dict
 105 |                     "weather": {"temp": 72, "condition": "sunny"},
 106 |                     "location": "San Francisco"
 107 |                 }
 108 |             },
 109 |             "results": {}                   # Empty dict to store outputs
 110 |         }
 111 |         ```
 112 | 
 113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions.
 114 |    - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example:
 115 |      - `type`: Regular (or Batch, or Async)
 116 |      - `prep`: Read "text" from the shared store
 117 |      - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures.
 118 |      - `post`: Write "embedding" to the shared store
 119 | 
 120 | 6. **Implementation**: Implement the initial nodes and flows based on the design.
 121 |    - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins!
 122 |    - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking.
 123 |    - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system.
 124 |    - Add logging throughout the code to facilitate debugging.
 125 | 
 126 | 7. **Optimization**:
 127 |    - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start.
 128 |    - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts.
 129 |    - If your flow design is already solid, move on to micro-optimizations:
 130 |      - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity.
 131 |      - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone.
 132 | 
 133 |    - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times.
 134 |      >
 135 |      > <div align="center"><img src="https://github.com/the-pocket/.github/raw/main/assets/success.png?raw=true" width="400"/></div>
 136 |      {: .best-practice }
 137 | 
 138 | 8. **Reliability**  
 139 |    - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times.
 140 |    - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging.
 141 |    - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain.
 142 | 
 143 | ## Example LLM Project File Structure
 144 | 
 145 | ```
 146 | my_project/
 147 | ├── main.py
 148 | ├── nodes.py
 149 | ├── flow.py
 150 | ├── utils/
 151 | │   ├── __init__.py
 152 | │   ├── call_llm.py
 153 | │   └── search_web.py
 154 | ├── requirements.txt
 155 | └── docs/
 156 |     └── design.md
 157 | ```
 158 | 
 159 | - **`requirements.txt`**: Lists the Python dependencies for the project.
 160 |   ```
 161 |   PyYAML
 162 |   pocketflow
 163 |   ```
 164 | 
 165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*.
 166 |   ~~~
 167 |   # Design Doc: Your Project Name
 168 | 
 169 |   > Please DON'T remove notes for AI
 170 | 
 171 |   ## Requirements
 172 | 
 173 |   > Notes for AI: Keep it simple and clear.
 174 |   > If the requirements are abstract, write concrete user stories
 175 | 
 176 | 
 177 |   ## Flow Design
 178 | 
 179 |   > Notes for AI:
 180 |   > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit.
 181 |   > 2. Present a concise, high-level description of the workflow.
 182 | 
 183 |   ### Applicable Design Pattern:
 184 | 
 185 |   1. Map the file summary into chunks, then reduce these chunks into a final summary.
 186 |   2. Agentic file finder
 187 |     - *Context*: The entire summary of the file
 188 |     - *Action*: Find the file
 189 | 
 190 |   ### Flow high-level Design:
 191 | 
 192 |   1. **First Node**: This node is for ...
 193 |   2. **Second Node**: This node is for ...
 194 |   3. **Third Node**: This node is for ...
 195 | 
 196 |   ```mermaid
 197 |   flowchart TD
 198 |       firstNode[First Node] --> secondNode[Second Node]
 199 |       secondNode --> thirdNode[Third Node]
 200 |   ```
 201 |   ## Utility Functions
 202 | 
 203 |   > Notes for AI:
 204 |   > 1. Understand the utility function definition thoroughly by reviewing the doc.
 205 |   > 2. Include only the necessary utility functions, based on nodes in the flow.
 206 | 
 207 |   1. **Call LLM** (`utils/call_llm.py`)
 208 |     - *Input*: prompt (str)
 209 |     - *Output*: response (str)
 210 |     - Generally used by most nodes for LLM tasks
 211 | 
 212 |   2. **Embedding** (`utils/get_embedding.py`)
 213 |     - *Input*: str
 214 |     - *Output*: a vector of 3072 floats
 215 |     - Used by the second node to embed text
 216 | 
 217 |   ## Node Design
 218 | 
 219 |   ### Shared Store
 220 | 
 221 |   > Notes for AI: Try to minimize data redundancy
 222 | 
 223 |   The shared store structure is organized as follows:
 224 | 
 225 |   ```python
 226 |   shared = {
 227 |       "key": "value"
 228 |   }
 229 |   ```
 230 | 
 231 |   ### Node Steps
 232 | 
 233 |   > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow.
 234 | 
 235 |   1. First Node
 236 |     - *Purpose*: Provide a short explanation of the node’s function
 237 |     - *Type*: Decide between Regular, Batch, or Async
 238 |     - *Steps*:
 239 |       - *prep*: Read "key" from the shared store
 240 |       - *exec*: Call the utility function
 241 |       - *post*: Write "key" to the shared store
 242 | 
 243 |   2. Second Node
 244 |     ...
 245 |   ~~~
 246 | 
 247 | 
 248 | - **`utils/`**: Contains all utility functions.
 249 |   - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`.
 250 |   - Each file should also include a `main()` function to try that API call
 251 |   ```python
 252 |   from google import genai
 253 |   import os
 254 | 
 255 |   def call_llm(prompt: str) -> str:
 256 |       client = genai.Client(
 257 |           api_key=os.getenv("GEMINI_API_KEY", ""),
 258 |       )
 259 |       model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
 260 |       response = client.models.generate_content(model=model, contents=[prompt])
 261 |       return response.text
 262 | 
 263 |   if __name__ == "__main__":
 264 |       test_prompt = "Hello, how are you?"
 265 | 
 266 |       # First call - should hit the API
 267 |       print("Making call...")
 268 |       response1 = call_llm(test_prompt, use_cache=False)
 269 |       print(f"Response: {response1}")
 270 |   ```
 271 | 
 272 | - **`nodes.py`**: Contains all the node definitions.
 273 |   ```python
 274 |   # nodes.py
 275 |   from pocketflow import Node
 276 |   from utils.call_llm import call_llm
 277 | 
 278 |   class GetQuestionNode(Node):
 279 |       def exec(self, _):
 280 |           # Get question directly from user input
 281 |           user_question = input("Enter your question: ")
 282 |           return user_question
 283 |       
 284 |       def post(self, shared, prep_res, exec_res):
 285 |           # Store the user's question
 286 |           shared["question"] = exec_res
 287 |           return "default"  # Go to the next node
 288 | 
 289 |   class AnswerNode(Node):
 290 |       def prep(self, shared):
 291 |           # Read question from shared
 292 |           return shared["question"]
 293 |       
 294 |       def exec(self, question):
 295 |           # Call LLM to get the answer
 296 |           return call_llm(question)
 297 |       
 298 |       def post(self, shared, prep_res, exec_res):
 299 |           # Store the answer in shared
 300 |           shared["answer"] = exec_res
 301 |   ```
 302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them.
 303 |   ```python
 304 |   # flow.py
 305 |   from pocketflow import Flow
 306 |   from nodes import GetQuestionNode, AnswerNode
 307 | 
 308 |   def create_qa_flow():
 309 |       """Create and return a question-answering flow."""
 310 |       # Create nodes
 311 |       get_question_node = GetQuestionNode()
 312 |       answer_node = AnswerNode()
 313 |       
 314 |       # Connect nodes in sequence
 315 |       get_question_node >> answer_node
 316 |       
 317 |       # Create flow starting with input node
 318 |       return Flow(start=get_question_node)
 319 |   ```
 320 | - **`main.py`**: Serves as the project's entry point.
 321 |   ```python
 322 |   # main.py
 323 |   from flow import create_qa_flow
 324 | 
 325 |   # Example main function
 326 |   # Please replace this with your own main function
 327 |   def main():
 328 |       shared = {
 329 |           "question": None,  # Will be populated by GetQuestionNode from user input
 330 |           "answer": None     # Will be populated by AnswerNode
 331 |       }
 332 | 
 333 |       # Create the flow and run it
 334 |       qa_flow = create_qa_flow()
 335 |       qa_flow.run(shared)
 336 |       print(f"Question: {shared['question']}")
 337 |       print(f"Answer: {shared['answer']}")
 338 | 
 339 |   if __name__ == "__main__":
 340 |       main()
 341 |   ```
 342 | 
 343 | ================================================
 344 | File: docs/index.md
 345 | ================================================
 346 | ---
 347 | layout: default
 348 | title: "Home"
 349 | nav_order: 1
 350 | ---
 351 | 
 352 | # Pocket Flow
 353 | 
 354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*.
 355 | 
 356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, and vendor lock-in.
 357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more.  
 358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications.
 359 | 
 360 | <div align="center">
 361 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/meme.jpg?raw=true" width="400"/>
 362 | </div>
 363 | 
 364 | ## Core Abstraction
 365 | 
 366 | We model the LLM workflow as a **Graph + Shared Store**:
 367 | 
 368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks.
 369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges).
 370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows.
 371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks.
 372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks.
 373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks.
 374 | 
 375 | <div align="center">
 376 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/abstraction.png" width="500"/>
 377 | </div>
 378 | 
 379 | ## Design Pattern
 380 | 
 381 | From there, it’s easy to implement popular design patterns:
 382 | 
 383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions.
 384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines.
 385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation.
 386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps.
 387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently.
 388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents.
 389 | 
 390 | <div align="center">
 391 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/design.png" width="500"/>
 392 | </div>
 393 | 
 394 | ## Utility Function
 395 | 
 396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*:
 397 | 
 398 | - [LLM Wrapper](./utility_function/llm.md)
 399 | - [Viz and Debug](./utility_function/viz.md)
 400 | - [Web Search](./utility_function/websearch.md)
 401 | - [Chunking](./utility_function/chunking.md)
 402 | - [Embedding](./utility_function/embedding.md)
 403 | - [Vector Databases](./utility_function/vector.md)
 404 | - [Text-to-Speech](./utility_function/text_to_speech.md)
 405 | 
 406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework:
 407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs.
 408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally.
 409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in.
 410 | 
 411 | ## Ready to build your Apps? 
 412 | 
 413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow!
 414 | 
 415 | ================================================
 416 | File: docs/core_abstraction/async.md
 417 | ================================================
 418 | ---
 419 | layout: default
 420 | title: "(Advanced) Async"
 421 | parent: "Core Abstraction"
 422 | nav_order: 5
 423 | ---
 424 | 
 425 | # (Advanced) Async
 426 | 
 427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for:
 428 | 
 429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way.
 430 | 2. **exec_async()**: Typically used for async LLM calls.
 431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`.
 432 | 
 433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes.
 434 | 
 435 | ### Example
 436 | 
 437 | ```python
 438 | class SummarizeThenVerify(AsyncNode):
 439 |     async def prep_async(self, shared):
 440 |         # Example: read a file asynchronously
 441 |         doc_text = await read_file_async(shared["doc_path"])
 442 |         return doc_text
 443 | 
 444 |     async def exec_async(self, prep_res):
 445 |         # Example: async LLM call
 446 |         summary = await call_llm_async(f"Summarize: {prep_res}")
 447 |         return summary
 448 | 
 449 |     async def post_async(self, shared, prep_res, exec_res):
 450 |         # Example: wait for user feedback
 451 |         decision = await gather_user_feedback(exec_res)
 452 |         if decision == "approve":
 453 |             shared["summary"] = exec_res
 454 |             return "approve"
 455 |         return "deny"
 456 | 
 457 | summarize_node = SummarizeThenVerify()
 458 | final_node = Finalize()
 459 | 
 460 | # Define transitions
 461 | summarize_node - "approve" >> final_node
 462 | summarize_node - "deny"    >> summarize_node  # retry
 463 | 
 464 | flow = AsyncFlow(start=summarize_node)
 465 | 
 466 | async def main():
 467 |     shared = {"doc_path": "document.txt"}
 468 |     await flow.run_async(shared)
 469 |     print("Final Summary:", shared.get("summary"))
 470 | 
 471 | asyncio.run(main())
 472 | ```
 473 | 
 474 | ================================================
 475 | File: docs/core_abstraction/batch.md
 476 | ================================================
 477 | ---
 478 | layout: default
 479 | title: "Batch"
 480 | parent: "Core Abstraction"
 481 | nav_order: 4
 482 | ---
 483 | 
 484 | # Batch
 485 | 
 486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases:
 487 | - **Chunk-based** processing (e.g., splitting large texts).
 488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs).
 489 | 
 490 | ## 1. BatchNode
 491 | 
 492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`:
 493 | 
 494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator).
 495 | - **`exec(item)`**: called **once** per item in that iterable.
 496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**.
 497 | 
 498 | 
 499 | ### Example: Summarize a Large File
 500 | 
 501 | ```python
 502 | class MapSummaries(BatchNode):
 503 |     def prep(self, shared):
 504 |         # Suppose we have a big file; chunk it
 505 |         content = shared["data"]
 506 |         chunk_size = 10000
 507 |         chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
 508 |         return chunks
 509 | 
 510 |     def exec(self, chunk):
 511 |         prompt = f"Summarize this chunk in 10 words: {chunk}"
 512 |         summary = call_llm(prompt)
 513 |         return summary
 514 | 
 515 |     def post(self, shared, prep_res, exec_res_list):
 516 |         combined = "\n".join(exec_res_list)
 517 |         shared["summary"] = combined
 518 |         return "default"
 519 | 
 520 | map_summaries = MapSummaries()
 521 | flow = Flow(start=map_summaries)
 522 | flow.run(shared)
 523 | ```
 524 | 
 525 | ---
 526 | 
 527 | ## 2. BatchFlow
 528 | 
 529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set.
 530 | 
 531 | ### Example: Summarize Many Files
 532 | 
 533 | ```python
 534 | class SummarizeAllFiles(BatchFlow):
 535 |     def prep(self, shared):
 536 |         # Return a list of param dicts (one per file)
 537 |         filenames = list(shared["data"].keys())  # e.g., ["file1.txt", "file2.txt", ...]
 538 |         return [{"filename": fn} for fn in filenames]
 539 | 
 540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce):
 541 | summarize_file = SummarizeFile(start=load_file)
 542 | 
 543 | # Wrap that flow into a BatchFlow:
 544 | summarize_all_files = SummarizeAllFiles(start=summarize_file)
 545 | summarize_all_files.run(shared)
 546 | ```
 547 | 
 548 | ### Under the Hood
 549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`.
 550 | 2. The **BatchFlow** loops through each dict. For each one:
 551 |    - It merges the dict with the BatchFlow’s own `params`.
 552 |    - It calls `flow.run(shared)` using the merged result.
 553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict.
 554 | 
 555 | ---
 556 | 
 557 | ## 3. Nested or Multi-Level Batches
 558 | 
 559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance:
 560 | - **Outer** batch: returns a list of diretory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...).
 561 | - **Inner** batch: returning a list of per-file param dicts.
 562 | 
 563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once.
 564 | 
 565 | ```python
 566 | 
 567 | class FileBatchFlow(BatchFlow):
 568 |     def prep(self, shared):
 569 |         directory = self.params["directory"]
 570 |         # e.g., files = ["file1.txt", "file2.txt", ...]
 571 |         files = [f for f in os.listdir(directory) if f.endswith(".txt")]
 572 |         return [{"filename": f} for f in files]
 573 | 
 574 | class DirectoryBatchFlow(BatchFlow):
 575 |     def prep(self, shared):
 576 |         directories = [ "/path/to/dirA", "/path/to/dirB"]
 577 |         return [{"directory": d} for d in directories]
 578 | 
 579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"}
 580 | inner_flow = FileBatchFlow(start=MapSummaries())
 581 | outer_flow = DirectoryBatchFlow(start=inner_flow)
 582 | ```
 583 | 
 584 | ================================================
 585 | File: docs/core_abstraction/communication.md
 586 | ================================================
 587 | ---
 588 | layout: default
 589 | title: "Communication"
 590 | parent: "Core Abstraction"
 591 | nav_order: 3
 592 | ---
 593 | 
 594 | # Communication
 595 | 
 596 | Nodes and Flows **communicate** in 2 ways:
 597 | 
 598 | 1. **Shared Store (for almost all the cases)** 
 599 | 
 600 |    - A global data structure (often an in-mem dict) that all nodes can read ( `prep()`) and write (`post()`).  
 601 |    - Great for data results, large content, or anything multiple nodes need.
 602 |    - You shall design the data structure and populate it ahead.
 603 |      
 604 |    - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*!  This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](./batch.md).
 605 |      {: .best-practice }
 606 | 
 607 | 2. **Params (only for [Batch](./batch.md))** 
 608 |    - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**.
 609 |    - Good for identifiers like filenames or numeric IDs, in Batch mode.
 610 | 
 611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller).
 612 | 
 613 | ---
 614 | 
 615 | ## 1. Shared Store
 616 | 
 617 | ### Overview
 618 | 
 619 | A shared store is typically an in-mem dictionary, like:
 620 | ```python
 621 | shared = {"data": {}, "summary": {}, "config": {...}, ...}
 622 | ```
 623 | 
 624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements.
 625 | 
 626 | ### Example
 627 | 
 628 | ```python
 629 | class LoadData(Node):
 630 |     def post(self, shared, prep_res, exec_res):
 631 |         # We write data to shared store
 632 |         shared["data"] = "Some text content"
 633 |         return None
 634 | 
 635 | class Summarize(Node):
 636 |     def prep(self, shared):
 637 |         # We read data from shared store
 638 |         return shared["data"]
 639 | 
 640 |     def exec(self, prep_res):
 641 |         # Call LLM to summarize
 642 |         prompt = f"Summarize: {prep_res}"
 643 |         summary = call_llm(prompt)
 644 |         return summary
 645 | 
 646 |     def post(self, shared, prep_res, exec_res):
 647 |         # We write summary to shared store
 648 |         shared["summary"] = exec_res
 649 |         return "default"
 650 | 
 651 | load_data = LoadData()
 652 | summarize = Summarize()
 653 | load_data >> summarize
 654 | flow = Flow(start=load_data)
 655 | 
 656 | shared = {}
 657 | flow.run(shared)
 658 | ```
 659 | 
 660 | Here:
 661 | - `LoadData` writes to `shared["data"]`.
 662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`.
 663 | 
 664 | ---
 665 | 
 666 | ## 2. Params
 667 | 
 668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are:
 669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`).
 670 | - **Set** via `set_params()`.
 671 | - **Cleared** and updated each time a parent Flow calls it.
 672 | 
 673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow. 
 674 | > 
 675 | > If you need to set child node params, see [Batch](./batch.md).
 676 | {: .warning }
 677 | 
 678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store.
 679 | 
 680 | ### Example
 681 | 
 682 | ```python
 683 | # 1) Create a Node that uses params
 684 | class SummarizeFile(Node):
 685 |     def prep(self, shared):
 686 |         # Access the node's param
 687 |         filename = self.params["filename"]
 688 |         return shared["data"].get(filename, "")
 689 | 
 690 |     def exec(self, prep_res):
 691 |         prompt = f"Summarize: {prep_res}"
 692 |         return call_llm(prompt)
 693 | 
 694 |     def post(self, shared, prep_res, exec_res):
 695 |         filename = self.params["filename"]
 696 |         shared["summary"][filename] = exec_res
 697 |         return "default"
 698 | 
 699 | # 2) Set params
 700 | node = SummarizeFile()
 701 | 
 702 | # 3) Set Node params directly (for testing)
 703 | node.set_params({"filename": "doc1.txt"})
 704 | node.run(shared)
 705 | 
 706 | # 4) Create Flow
 707 | flow = Flow(start=node)
 708 | 
 709 | # 5) Set Flow params (overwrites node params)
 710 | flow.set_params({"filename": "doc2.txt"})
 711 | flow.run(shared)  # The node summarizes doc2, not doc1
 712 | ```
 713 | 
 714 | ================================================
 715 | File: docs/core_abstraction/flow.md
 716 | ================================================
 717 | ---
 718 | layout: default
 719 | title: "Flow"
 720 | parent: "Core Abstraction"
 721 | nav_order: 2
 722 | ---
 723 | 
 724 | # Flow
 725 | 
 726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`.
 727 | 
 728 | ## 1. Action-based Transitions
 729 | 
 730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`.
 731 | 
 732 | You define transitions with the syntax:
 733 | 
 734 | 1. **Basic default transition**: `node_a >> node_b`
 735 |   This means if `node_a.post()` returns `"default"`, go to `node_b`. 
 736 |   (Equivalent to `node_a - "default" >> node_b`)
 737 | 
 738 | 2. **Named action transition**: `node_a - "action_name" >> node_b`
 739 |   This means if `node_a.post()` returns `"action_name"`, go to `node_b`.
 740 | 
 741 | It's possible to create loops, branching, or multi-step flows.
 742 | 
 743 | ## 2. Creating a Flow
 744 | 
 745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node.
 746 | 
 747 | ### Example: Simple Sequence
 748 | 
 749 | Here's a minimal flow of two nodes in a chain:
 750 | 
 751 | ```python
 752 | node_a >> node_b
 753 | flow = Flow(start=node_a)
 754 | flow.run(shared)
 755 | ```
 756 | 
 757 | - When you run the flow, it executes `node_a`.  
 758 | - Suppose `node_a.post()` returns `"default"`.  
 759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`.  
 760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there.
 761 | 
 762 | ### Example: Branching & Looping
 763 | 
 764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions:
 765 | 
 766 | - `"approved"`: expense is approved, move to payment processing
 767 | - `"needs_revision"`: expense needs changes, send back for revision 
 768 | - `"rejected"`: expense is denied, finish the process
 769 | 
 770 | We can wire them like this:
 771 | 
 772 | ```python
 773 | # Define the flow connections
 774 | review - "approved" >> payment        # If approved, process payment
 775 | review - "needs_revision" >> revise   # If needs changes, go to revision
 776 | review - "rejected" >> finish         # If rejected, finish the process
 777 | 
 778 | revise >> review   # After revision, go back for another review
 779 | payment >> finish  # After payment, finish the process
 780 | 
 781 | flow = Flow(start=review)
 782 | ```
 783 | 
 784 | Let's see how it flows:
 785 | 
 786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node
 787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review`
 788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops
 789 | 
 790 | ```mermaid
 791 | flowchart TD
 792 |     review[Review Expense] -->|approved| payment[Process Payment]
 793 |     review -->|needs_revision| revise[Revise Report]
 794 |     review -->|rejected| finish[Finish Process]
 795 | 
 796 |     revise --> review
 797 |     payment --> finish
 798 | ```
 799 | 
 800 | ### Running Individual Nodes vs. Running a Flow
 801 | 
 802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action. 
 803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue.
 804 | 
 805 | > `node.run(shared)` **does not** proceed to the successor.
 806 | > This is mainly for debugging or testing a single node.
 807 | > 
 808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly.
 809 | {: .warning }
 810 | 
 811 | ## 3. Nested Flows
 812 | 
 813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can:
 814 | 
 815 | 1. Use a Flow as a Node within another Flow's transitions.  
 816 | 2. Combine multiple smaller Flows into a larger Flow for reuse.  
 817 | 3. Node `params` will be a merging of **all** parents' `params`.
 818 | 
 819 | ### Flow's Node Methods
 820 | 
 821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However:
 822 | 
 823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes.
 824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store.
 825 | 
 826 | ### Basic Flow Nesting
 827 | 
 828 | Here's how to connect a flow to another node:
 829 | 
 830 | ```python
 831 | # Create a sub-flow
 832 | node_a >> node_b
 833 | subflow = Flow(start=node_a)
 834 | 
 835 | # Connect it to another node
 836 | subflow >> node_c
 837 | 
 838 | # Create the parent flow
 839 | parent_flow = Flow(start=subflow)
 840 | ```
 841 | 
 842 | When `parent_flow.run()` executes:
 843 | 1. It starts `subflow`
 844 | 2. `subflow` runs through its nodes (`node_a->node_b`)
 845 | 3. After `subflow` completes, execution continues to `node_c`
 846 | 
 847 | ### Example: Order Processing Pipeline
 848 | 
 849 | Here's a practical example that breaks down order processing into nested flows:
 850 | 
 851 | ```python
 852 | # Payment processing sub-flow
 853 | validate_payment >> process_payment >> payment_confirmation
 854 | payment_flow = Flow(start=validate_payment)
 855 | 
 856 | # Inventory sub-flow
 857 | check_stock >> reserve_items >> update_inventory
 858 | inventory_flow = Flow(start=check_stock)
 859 | 
 860 | # Shipping sub-flow
 861 | create_label >> assign_carrier >> schedule_pickup
 862 | shipping_flow = Flow(start=create_label)
 863 | 
 864 | # Connect the flows into a main order pipeline
 865 | payment_flow >> inventory_flow >> shipping_flow
 866 | 
 867 | # Create the master flow
 868 | order_pipeline = Flow(start=payment_flow)
 869 | 
 870 | # Run the entire pipeline
 871 | order_pipeline.run(shared_data)
 872 | ```
 873 | 
 874 | This creates a clean separation of concerns while maintaining a clear execution path:
 875 | 
 876 | ```mermaid
 877 | flowchart LR
 878 |     subgraph order_pipeline[Order Pipeline]
 879 |         subgraph paymentFlow["Payment Flow"]
 880 |             A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation]
 881 |         end
 882 | 
 883 |         subgraph inventoryFlow["Inventory Flow"]
 884 |             D[Check Stock] --> E[Reserve Items] --> F[Update Inventory]
 885 |         end
 886 | 
 887 |         subgraph shippingFlow["Shipping Flow"]
 888 |             G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup]
 889 |         end
 890 | 
 891 |         paymentFlow --> inventoryFlow
 892 |         inventoryFlow --> shippingFlow
 893 |     end
 894 | ```
 895 | 
 896 | ================================================
 897 | File: docs/core_abstraction/node.md
 898 | ================================================
 899 | ---
 900 | layout: default
 901 | title: "Node"
 902 | parent: "Core Abstraction"
 903 | nav_order: 1
 904 | ---
 905 | 
 906 | # Node
 907 | 
 908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`:
 909 | 
 910 | <div align="center">
 911 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/node.png?raw=true" width="400"/>
 912 | </div>
 913 | 
 914 | 1. `prep(shared)`
 915 |    - **Read and preprocess data** from `shared` store. 
 916 |    - Examples: *query DB, read files, or serialize data into a string*.
 917 |    - Return `prep_res`, which is used by `exec()` and `post()`.
 918 | 
 919 | 2. `exec(prep_res)`
 920 |    - **Execute compute logic**, with optional retries and error handling (below).
 921 |    - Examples: *(mostly) LLM calls, remote APIs, tool use*.
 922 |    - ⚠️ This shall be only for compute and **NOT** access `shared`.
 923 |    - ⚠️ If retries enabled, ensure idempotent implementation.
 924 |    - ⚠️ Defer exception handling to the Node's built-in retry mechanism.
 925 |    - Return `exec_res`, which is passed to `post()`.
 926 | 
 927 | 3. `post(shared, prep_res, exec_res)`
 928 |    - **Postprocess and write data** back to `shared`.
 929 |    - Examples: *update DB, change states, log results*.
 930 |    - **Decide the next action** by returning a *string* (`action = "default"` if *None*).
 931 | 
 932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately.
 933 | >
 934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data.
 935 | {: .note }
 936 | 
 937 | ### Fault Tolerance & Retries
 938 | 
 939 | You can **retry** `exec()` if it raises an exception via two parameters when define the Node:
 940 | 
 941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry).
 942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting). 
 943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off.
 944 | 
 945 | ```python 
 946 | my_node = SummarizeFile(max_retries=3, wait=10)
 947 | ```
 948 | 
 949 | When an exception occurs in `exec()`, the Node automatically retries until:
 950 | 
 951 | - It either succeeds, or
 952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt.
 953 | 
 954 | You can get the current retry times (0-based) from `self.cur_retry`.
 955 | 
 956 | ```python 
 957 | class RetryNode(Node):
 958 |     def exec(self, prep_res):
 959 |         print(f"Retry {self.cur_retry} times")
 960 |         raise Exception("Failed")
 961 | ```
 962 | 
 963 | ### Graceful Fallback
 964 | 
 965 | To **gracefully handle** the exception (after all retries) rather than raising it, override:
 966 | 
 967 | ```python 
 968 | def exec_fallback(self, prep_res, exc):
 969 |     raise exc
 970 | ```
 971 | 
 972 | By default, it just re-raises exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`.
 973 | 
 974 | ### Example: Summarize file
 975 | 
 976 | ```python 
 977 | class SummarizeFile(Node):
 978 |     def prep(self, shared):
 979 |         return shared["data"]
 980 | 
 981 |     def exec(self, prep_res):
 982 |         if not prep_res:
 983 |             return "Empty file content"
 984 |         prompt = f"Summarize this text in 10 words: {prep_res}"
 985 |         summary = call_llm(prompt)  # might fail
 986 |         return summary
 987 | 
 988 |     def exec_fallback(self, prep_res, exc):
 989 |         # Provide a simple fallback instead of crashing
 990 |         return "There was an error processing your request."
 991 | 
 992 |     def post(self, shared, prep_res, exec_res):
 993 |         shared["summary"] = exec_res
 994 |         # Return "default" by not returning
 995 | 
 996 | summarize_node = SummarizeFile(max_retries=3)
 997 | 
 998 | # node.run() calls prep->exec->post
 999 | # If exec() fails, it retries up to 3 times before calling exec_fallback()
1000 | action_result = summarize_node.run(shared)
1001 | 
1002 | print("Action returned:", action_result)  # "default"
1003 | print("Summary stored:", shared["summary"])
1004 | ```
1005 | 
1006 | ================================================
1007 | File: docs/core_abstraction/parallel.md
1008 | ================================================
1009 | ---
1010 | layout: default
1011 | title: "(Advanced) Parallel"
1012 | parent: "Core Abstraction"
1013 | nav_order: 6
1014 | ---
1015 | 
1016 | # (Advanced) Parallel
1017 | 
1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows  **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute. 
1019 | 
1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O.
1021 | {: .warning }
1022 | 
1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize.
1024 | > 
1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals).
1026 | > 
1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits.
1028 | {: .best-practice }
1029 | 
1030 | ## AsyncParallelBatchNode
1031 | 
1032 | Like **AsyncBatchNode**, but run `exec_async()` in **parallel**:
1033 | 
1034 | ```python
1035 | class ParallelSummaries(AsyncParallelBatchNode):
1036 |     async def prep_async(self, shared):
1037 |         # e.g., multiple texts
1038 |         return shared["texts"]
1039 | 
1040 |     async def exec_async(self, text):
1041 |         prompt = f"Summarize: {text}"
1042 |         return await call_llm_async(prompt)
1043 | 
1044 |     async def post_async(self, shared, prep_res, exec_res_list):
1045 |         shared["summary"] = "\n\n".join(exec_res_list)
1046 |         return "default"
1047 | 
1048 | node = ParallelSummaries()
1049 | flow = AsyncFlow(start=node)
1050 | ```
1051 | 
1052 | ## AsyncParallelBatchFlow
1053 | 
1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters:
1055 | 
1056 | ```python
1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow):
1058 |     async def prep_async(self, shared):
1059 |         return [{"filename": f} for f in shared["files"]]
1060 | 
1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile())
1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow)
1063 | await parallel_flow.run_async(shared)
1064 | ```
1065 | 
1066 | ================================================
1067 | File: docs/design_pattern/agent.md
1068 | ================================================
1069 | ---
1070 | layout: default
1071 | title: "Agent"
1072 | parent: "Design Pattern"
1073 | nav_order: 1
1074 | ---
1075 | 
1076 | # Agent
1077 | 
1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context.
1079 | 
1080 | <div align="center">
1081 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/agent.png?raw=true" width="350"/>
1082 | </div>
1083 | 
1084 | ## Implement Agent with Graph
1085 | 
1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions.  
1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step.
1088 | 3. **Agent Node:** Provide a prompt to decide action—for example:
1089 | 
1090 | ```python
1091 | f"""
1092 | ### CONTEXT
1093 | Task: {task_description}
1094 | Previous Actions: {previous_actions}
1095 | Current State: {current_state}
1096 | 
1097 | ### ACTION SPACE
1098 | [1] search
1099 |   Description: Use web search to get results
1100 |   Parameters:
1101 |     - query (str): What to search for
1102 | 
1103 | [2] answer
1104 |   Description: Conclude based on the results
1105 |   Parameters:
1106 |     - result (str): Final answer to provide
1107 | 
1108 | ### NEXT ACTION
1109 | Decide the next action based on the current context and available action space.
1110 | Return your response in the following format:
1111 | 
1112 | ```yaml
1113 | thinking: |
1114 |     <your step-by-step reasoning process>
1115 | action: <action_name>
1116 | parameters:
1117 |     <parameter_name>: <parameter_value>
1118 | ```"""
1119 | ```
1120 | 
1121 | The core of building **high-performance** and **reliable** agents boils down to:
1122 | 
1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content.
1124 | 
1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or  `read_csvs`. Instead, import CSVs into the database.
1126 | 
1127 | ## Example Good Action Design
1128 | 
1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once.
1130 | 
1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts).
1132 | 
1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files.
1134 | 
1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends.
1136 | 
1137 | ## Example: Search Agent
1138 | 
1139 | This agent:
1140 | 1. Decides whether to search or answer
1141 | 2. If searches, loops back to decide if more search needed
1142 | 3. Answers when enough context gathered
1143 | 
1144 | ```python
1145 | class DecideAction(Node):
1146 |     def prep(self, shared):
1147 |         context = shared.get("context", "No previous search")
1148 |         query = shared["query"]
1149 |         return query, context
1150 |         
1151 |     def exec(self, inputs):
1152 |         query, context = inputs
1153 |         prompt = f"""
1154 | Given input: {query}
1155 | Previous search results: {context}
1156 | Should I: 1) Search web for more info 2) Answer with current knowledge
1157 | Output in yaml:
1158 | ```yaml
1159 | action: search/answer
1160 | reason: why this action
1161 | search_term: search phrase if action is search
1162 | ```"""
1163 |         resp = call_llm(prompt)
1164 |         yaml_str = resp.split("```yaml")[1].split("```")[0].strip()
1165 |         result = yaml.safe_load(yaml_str)
1166 |         
1167 |         assert isinstance(result, dict)
1168 |         assert "action" in result
1169 |         assert "reason" in result
1170 |         assert result["action"] in ["search", "answer"]
1171 |         if result["action"] == "search":
1172 |             assert "search_term" in result
1173 |         
1174 |         return result
1175 | 
1176 |     def post(self, shared, prep_res, exec_res):
1177 |         if exec_res["action"] == "search":
1178 |             shared["search_term"] = exec_res["search_term"]
1179 |         return exec_res["action"]
1180 | 
1181 | class SearchWeb(Node):
1182 |     def prep(self, shared):
1183 |         return shared["search_term"]
1184 |         
1185 |     def exec(self, search_term):
1186 |         return search_web(search_term)
1187 |     
1188 |     def post(self, shared, prep_res, exec_res):
1189 |         prev_searches = shared.get("context", [])
1190 |         shared["context"] = prev_searches + [
1191 |             {"term": shared["search_term"], "result": exec_res}
1192 |         ]
1193 |         return "decide"
1194 |         
1195 | class DirectAnswer(Node):
1196 |     def prep(self, shared):
1197 |         return shared["query"], shared.get("context", "")
1198 |         
1199 |     def exec(self, inputs):
1200 |         query, context = inputs
1201 |         return call_llm(f"Context: {context}\nAnswer: {query}")
1202 | 
1203 |     def post(self, shared, prep_res, exec_res):
1204 |        print(f"Answer: {exec_res}")
1205 |        shared["answer"] = exec_res
1206 | 
1207 | # Connect nodes
1208 | decide = DecideAction()
1209 | search = SearchWeb()
1210 | answer = DirectAnswer()
1211 | 
1212 | decide - "search" >> search
1213 | decide - "answer" >> answer
1214 | search - "decide" >> decide  # Loop back
1215 | 
1216 | flow = Flow(start=decide)
1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"})
1218 | ```
1219 | 
1220 | ================================================
1221 | File: docs/design_pattern/mapreduce.md
1222 | ================================================
1223 | ---
1224 | layout: default
1225 | title: "Map Reduce"
1226 | parent: "Design Pattern"
1227 | nav_order: 4
1228 | ---
1229 | 
1230 | # Map Reduce
1231 | 
1232 | MapReduce is a design pattern suitable when you have either:
1233 | - Large input data (e.g., multiple files to process), or
1234 | - Large output data (e.g., multiple forms to fill)
1235 | 
1236 | and there is a logical way to break the task into smaller, ideally independent parts. 
1237 | 
1238 | <div align="center">
1239 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/mapreduce.png?raw=true" width="400"/>
1240 | </div>
1241 | 
1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase.
1243 | 
1244 | ### Example: Document Summarization
1245 | 
1246 | ```python
1247 | class SummarizeAllFiles(BatchNode):
1248 |     def prep(self, shared):
1249 |         files_dict = shared["files"]  # e.g. 10 files
1250 |         return list(files_dict.items())  # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...]
1251 | 
1252 |     def exec(self, one_file):
1253 |         filename, file_content = one_file
1254 |         summary_text = call_llm(f"Summarize the following file:\n{file_content}")
1255 |         return (filename, summary_text)
1256 | 
1257 |     def post(self, shared, prep_res, exec_res_list):
1258 |         shared["file_summaries"] = dict(exec_res_list)
1259 | 
1260 | class CombineSummaries(Node):
1261 |     def prep(self, shared):
1262 |         return shared["file_summaries"]
1263 | 
1264 |     def exec(self, file_summaries):
1265 |         # format as: "File1: summary\nFile2: summary...\n"
1266 |         text_list = []
1267 |         for fname, summ in file_summaries.items():
1268 |             text_list.append(f"{fname} summary:\n{summ}\n")
1269 |         big_text = "\n---\n".join(text_list)
1270 | 
1271 |         return call_llm(f"Combine these file summaries into one final summary:\n{big_text}")
1272 | 
1273 |     def post(self, shared, prep_res, final_summary):
1274 |         shared["all_files_summary"] = final_summary
1275 | 
1276 | batch_node = SummarizeAllFiles()
1277 | combine_node = CombineSummaries()
1278 | batch_node >> combine_node
1279 | 
1280 | flow = Flow(start=batch_node)
1281 | 
1282 | shared = {
1283 |     "files": {
1284 |         "file1.txt": "Alice was beginning to get very tired of sitting by her sister...",
1285 |         "file2.txt": "Some other interesting text ...",
1286 |         # ...
1287 |     }
1288 | }
1289 | flow.run(shared)
1290 | print("Individual Summaries:", shared["file_summaries"])
1291 | print("\nFinal Summary:\n", shared["all_files_summary"])
1292 | ```
1293 | 
1294 | ================================================
1295 | File: docs/design_pattern/rag.md
1296 | ================================================
1297 | ---
1298 | layout: default
1299 | title: "RAG"
1300 | parent: "Design Pattern"
1301 | nav_order: 3
1302 | ---
1303 | 
1304 | # RAG (Retrieval Augmented Generation)
1305 | 
1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline:
1307 | 
1308 | <div align="center">
1309 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/rag.png?raw=true" width="400"/>
1310 | </div>
1311 | 
1312 | 1. **Offline stage**: Preprocess and index documents ("building the index").
1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context.
1314 | 
1315 | ---
1316 | ## Stage 1: Offline Indexing
1317 | 
1318 | We create three Nodes:
1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text.
1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk.
1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md).
1322 | 
1323 | ```python
1324 | class ChunkDocs(BatchNode):
1325 |     def prep(self, shared):
1326 |         # A list of file paths in shared["files"]. We process each file.
1327 |         return shared["files"]
1328 | 
1329 |     def exec(self, filepath):
1330 |         # read file content. In real usage, do error handling.
1331 |         with open(filepath, "r", encoding="utf-8") as f:
1332 |             text = f.read()
1333 |         # chunk by 100 chars each
1334 |         chunks = []
1335 |         size = 100
1336 |         for i in range(0, len(text), size):
1337 |             chunks.append(text[i : i + size])
1338 |         return chunks
1339 |     
1340 |     def post(self, shared, prep_res, exec_res_list):
1341 |         # exec_res_list is a list of chunk-lists, one per file.
1342 |         # flatten them all into a single list of chunks.
1343 |         all_chunks = []
1344 |         for chunk_list in exec_res_list:
1345 |             all_chunks.extend(chunk_list)
1346 |         shared["all_chunks"] = all_chunks
1347 | 
1348 | class EmbedDocs(BatchNode):
1349 |     def prep(self, shared):
1350 |         return shared["all_chunks"]
1351 | 
1352 |     def exec(self, chunk):
1353 |         return get_embedding(chunk)
1354 | 
1355 |     def post(self, shared, prep_res, exec_res_list):
1356 |         # Store the list of embeddings.
1357 |         shared["all_embeds"] = exec_res_list
1358 |         print(f"Total embeddings: {len(exec_res_list)}")
1359 | 
1360 | class StoreIndex(Node):
1361 |     def prep(self, shared):
1362 |         # We'll read all embeds from shared.
1363 |         return shared["all_embeds"]
1364 | 
1365 |     def exec(self, all_embeds):
1366 |         # Create a vector index (faiss or other DB in real usage).
1367 |         index = create_index(all_embeds)
1368 |         return index
1369 | 
1370 |     def post(self, shared, prep_res, index):
1371 |         shared["index"] = index
1372 | 
1373 | # Wire them in sequence
1374 | chunk_node = ChunkDocs()
1375 | embed_node = EmbedDocs()
1376 | store_node = StoreIndex()
1377 | 
1378 | chunk_node >> embed_node >> store_node
1379 | 
1380 | OfflineFlow = Flow(start=chunk_node)
1381 | ```
1382 | 
1383 | Usage example:
1384 | 
1385 | ```python
1386 | shared = {
1387 |     "files": ["doc1.txt", "doc2.txt"],  # any text files
1388 | }
1389 | OfflineFlow.run(shared)
1390 | ```
1391 | 
1392 | ---
1393 | ## Stage 2: Online Query & Answer
1394 | 
1395 | We have 3 nodes:
1396 | 1. `EmbedQuery` – embeds the user’s question.
1397 | 2. `RetrieveDocs` – retrieves top chunk from the index.
1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer.
1399 | 
1400 | ```python
1401 | class EmbedQuery(Node):
1402 |     def prep(self, shared):
1403 |         return shared["question"]
1404 | 
1405 |     def exec(self, question):
1406 |         return get_embedding(question)
1407 | 
1408 |     def post(self, shared, prep_res, q_emb):
1409 |         shared["q_emb"] = q_emb
1410 | 
1411 | class RetrieveDocs(Node):
1412 |     def prep(self, shared):
1413 |         # We'll need the query embedding, plus the offline index/chunks
1414 |         return shared["q_emb"], shared["index"], shared["all_chunks"]
1415 | 
1416 |     def exec(self, inputs):
1417 |         q_emb, index, chunks = inputs
1418 |         I, D = search_index(index, q_emb, top_k=1)
1419 |         best_id = I[0][0]
1420 |         relevant_chunk = chunks[best_id]
1421 |         return relevant_chunk
1422 | 
1423 |     def post(self, shared, prep_res, relevant_chunk):
1424 |         shared["retrieved_chunk"] = relevant_chunk
1425 |         print("Retrieved chunk:", relevant_chunk[:60], "...")
1426 | 
1427 | class GenerateAnswer(Node):
1428 |     def prep(self, shared):
1429 |         return shared["question"], shared["retrieved_chunk"]
1430 | 
1431 |     def exec(self, inputs):
1432 |         question, chunk = inputs
1433 |         prompt = f"Question: {question}\nContext: {chunk}\nAnswer:"
1434 |         return call_llm(prompt)
1435 | 
1436 |     def post(self, shared, prep_res, answer):
1437 |         shared["answer"] = answer
1438 |         print("Answer:", answer)
1439 | 
1440 | embed_qnode = EmbedQuery()
1441 | retrieve_node = RetrieveDocs()
1442 | generate_node = GenerateAnswer()
1443 | 
1444 | embed_qnode >> retrieve_node >> generate_node
1445 | OnlineFlow = Flow(start=embed_qnode)
1446 | ```
1447 | 
1448 | Usage example:
1449 | 
1450 | ```python
1451 | # Suppose we already ran OfflineFlow and have:
1452 | # shared["all_chunks"], shared["index"], etc.
1453 | shared["question"] = "Why do people like cats?"
1454 | 
1455 | OnlineFlow.run(shared)
1456 | # final answer in shared["answer"]
1457 | ```
1458 | 
1459 | ================================================
1460 | File: docs/design_pattern/structure.md
1461 | ================================================
1462 | ---
1463 | layout: default
1464 | title: "Structured Output"
1465 | parent: "Design Pattern"
1466 | nav_order: 5
1467 | ---
1468 | 
1469 | # Structured Output
1470 | 
1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys.
1472 | 
1473 | There are several approaches to achieve a structured output:
1474 | - **Prompting** the LLM to strictly return a defined structure.
1475 | - Using LLMs that natively support **schema enforcement**.
1476 | - **Post-processing** the LLM's response to extract structured content.
1477 | 
1478 | In practice, **Prompting** is simple and reliable for modern LLMs.
1479 | 
1480 | ### Example Use Cases
1481 | 
1482 | - Extracting Key Information 
1483 | 
1484 | ```yaml
1485 | product:
1486 |   name: Widget Pro
1487 |   price: 199.99
1488 |   description: |
1489 |     A high-quality widget designed for professionals.
1490 |     Recommended for advanced users.
1491 | ```
1492 | 
1493 | - Summarizing Documents into Bullet Points
1494 | 
1495 | ```yaml
1496 | summary:
1497 |   - This product is easy to use.
1498 |   - It is cost-effective.
1499 |   - Suitable for all skill levels.
1500 | ```
1501 | 
1502 | - Generating Configuration Files
1503 | 
1504 | ```yaml
1505 | server:
1506 |   host: 127.0.0.1
1507 |   port: 8080
1508 |   ssl: true
1509 | ```
1510 | 
1511 | ## Prompt Engineering
1512 | 
1513 | When prompting the LLM to produce **structured** output:
1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`).
1515 | 2. **Validate** that all required fields exist (and let `Node` handles retry).
1516 | 
1517 | ### Example Text Summarization
1518 | 
1519 | ```python
1520 | class SummarizeNode(Node):
1521 |     def exec(self, prep_res):
1522 |         # Suppose `prep_res` is the text to summarize.
1523 |         prompt = f"""
1524 | Please summarize the following text as YAML, with exactly 3 bullet points
1525 | 
1526 | {prep_res}
1527 | 
1528 | Now, output:
1529 | ```yaml
1530 | summary:
1531 |   - bullet 1
1532 |   - bullet 2
1533 |   - bullet 3
1534 | ```"""
1535 |         response = call_llm(prompt)
1536 |         yaml_str = response.split("```yaml")[1].split("```")[0].strip()
1537 | 
1538 |         import yaml
1539 |         structured_result = yaml.safe_load(yaml_str)
1540 | 
1541 |         assert "summary" in structured_result
1542 |         assert isinstance(structured_result["summary"], list)
1543 | 
1544 |         return structured_result
1545 | ```
1546 | 
1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic)
1548 | {: .note }
1549 | 
1550 | ### Why YAML instead of JSON?
1551 | 
1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes.
1553 | 
1554 | **In JSON**  
1555 | 
1556 | ```json
1557 | {
1558 |   "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\""
1559 | }
1560 | ```
1561 | 
1562 | - Every double quote inside the string must be escaped with `\"`.
1563 | - Each newline in the dialogue must be represented as `\n`.
1564 | 
1565 | **In YAML**  
1566 | 
1567 | ```yaml
1568 | dialogue: |
1569 |   Alice said: "Hello Bob.
1570 |   How are you?
1571 |   I am good."
1572 | ```
1573 | 
1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`).
1575 | - Newlines are naturally preserved without needing `\n`.
1576 | 
1577 | ================================================
1578 | File: docs/design_pattern/workflow.md
1579 | ================================================
1580 | ---
1581 | layout: default
1582 | title: "Workflow"
1583 | parent: "Design Pattern"
1584 | nav_order: 2
1585 | ---
1586 | 
1587 | # Workflow
1588 | 
1589 | Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes.
1590 | 
1591 | <div align="center">
1592 |   <img src="https://github.com/the-pocket/.github/raw/main/assets/workflow.png?raw=true" width="400"/>
1593 | </div>
1594 | 
1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*.
1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*.
1597 | > 
1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md).
1599 | {: .best-practice }
1600 | 
1601 | ### Example: Article Writing
1602 | 
1603 | ```python
1604 | class GenerateOutline(Node):
1605 |     def prep(self, shared): return shared["topic"]
1606 |     def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}")
1607 |     def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res
1608 | 
1609 | class WriteSection(Node):
1610 |     def prep(self, shared): return shared["outline"]
1611 |     def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}")
1612 |     def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res
1613 | 
1614 | class ReviewAndRefine(Node):
1615 |     def prep(self, shared): return shared["draft"]
1616 |     def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}")
1617 |     def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res
1618 | 
1619 | # Connect nodes
1620 | outline = GenerateOutline()
1621 | write = WriteSection()
1622 | review = ReviewAndRefine()
1623 | 
1624 | outline >> write >> review
1625 | 
1626 | # Create and run flow
1627 | writing_flow = Flow(start=outline)
1628 | shared = {"topic": "AI Safety"}
1629 | writing_flow.run(shared)
1630 | ```
1631 | 
1632 | For *dynamic cases*, consider using [Agents](./agent.md).
1633 | 
1634 | ================================================
1635 | File: docs/utility_function/llm.md
1636 | ================================================
1637 | ---
1638 | layout: default
1639 | title: "LLM Wrapper"
1640 | parent: "Utility Function"
1641 | nav_order: 1
1642 | ---
1643 | 
1644 | # LLM Wrappers
1645 | 
1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm). 
1647 | Here, we provide some minimal example implementations:
1648 | 
1649 | 1. OpenAI
1650 |     ```python
1651 |     def call_llm(prompt):
1652 |         from openai import OpenAI
1653 |         client = OpenAI(api_key="YOUR_API_KEY_HERE")
1654 |         r = client.chat.completions.create(
1655 |             model="gpt-4o",
1656 |             messages=[{"role": "user", "content": prompt}]
1657 |         )
1658 |         return r.choices[0].message.content
1659 | 
1660 |     # Example usage
1661 |     call_llm("How are you?")
1662 |     ```
1663 |     > Store the API key in an environment variable like OPENAI_API_KEY for security.
1664 |     {: .best-practice }
1665 | 
1666 | 2. Claude (Anthropic)
1667 |     ```python
1668 |     def call_llm(prompt):
1669 |         from anthropic import Anthropic
1670 |         client = Anthropic(api_key="YOUR_API_KEY_HERE")
1671 |         r = client.messages.create(
1672 |             model="claude-sonnet-4-0",
1673 |             messages=[
1674 |                 {"role": "user", "content": prompt}
1675 |             ]
1676 |         )
1677 |         return r.content[0].text
1678 |     ```
1679 | 
1680 | 3. Google (Generative AI Studio / PaLM API)
1681 |     ```python
1682 |     def call_llm(prompt):
1683 |     from google import genai
1684 |     client = genai.Client(api_key='GEMINI_API_KEY')
1685 |         response = client.models.generate_content(
1686 |         model='gemini-2.5-pro',
1687 |         contents=prompt
1688 |     )
1689 |     return response.text
1690 |     ```
1691 | 
1692 | 4. Azure (Azure OpenAI)
1693 |     ```python
1694 |     def call_llm(prompt):
1695 |         from openai import AzureOpenAI
1696 |         client = AzureOpenAI(
1697 |             azure_endpoint="https://<YOUR_RESOURCE_NAME>.openai.azure.com/",
1698 |             api_key="YOUR_API_KEY_HERE",
1699 |             api_version="2023-05-15"
1700 |         )
1701 |         r = client.chat.completions.create(
1702 |             model="<YOUR_DEPLOYMENT_NAME>",
1703 |             messages=[{"role": "user", "content": prompt}]
1704 |         )
1705 |         return r.choices[0].message.content
1706 |     ```
1707 | 
1708 | 5. Ollama (Local LLM)
1709 |     ```python
1710 |     def call_llm(prompt):
1711 |         from ollama import chat
1712 |         response = chat(
1713 |             model="llama2",
1714 |             messages=[{"role": "user", "content": prompt}]
1715 |         )
1716 |         return response.message.content
1717 |     ```
1718 | 
1719 | ## Improvements
1720 | Feel free to enhance your `call_llm` function as needed. Here are examples:
1721 | 
1722 | - Handle chat history:
1723 | 
1724 | ```python
1725 | def call_llm(messages):
1726 |     from openai import OpenAI
1727 |     client = OpenAI(api_key="YOUR_API_KEY_HERE")
1728 |     r = client.chat.completions.create(
1729 |         model="gpt-4o",
1730 |         messages=messages
1731 |     )
1732 |     return r.choices[0].message.content
1733 | ```
1734 | 
1735 | - Add in-memory caching 
1736 | 
1737 | ```python
1738 | from functools import lru_cache
1739 | 
1740 | @lru_cache(maxsize=1000)
1741 | def call_llm(prompt):
1742 |     # Your implementation here
1743 |     pass
1744 | ```
1745 | 
1746 | > ⚠️ Caching conflicts with Node retries, as retries yield the same result.
1747 | >
1748 | > To address this, you could use cached results only if not retried.
1749 | {: .warning }
1750 | 
1751 | 
1752 | ```python
1753 | from functools import lru_cache
1754 | 
1755 | @lru_cache(maxsize=1000)
1756 | def cached_call(prompt):
1757 |     pass
1758 | 
1759 | def call_llm(prompt, use_cache):
1760 |     if use_cache:
1761 |         return cached_call(prompt)
1762 |     # Call the underlying function directly
1763 |     return cached_call.__wrapped__(prompt)
1764 | 
1765 | class SummarizeNode(Node):
1766 |     def exec(self, text):
1767 |         return call_llm(f"Summarize: {text}", self.cur_retry==0)
1768 | ```
1769 | 
1770 | - Enable logging:
1771 | 
1772 | ```python
1773 | def call_llm(prompt):
1774 |     import logging
1775 |     logging.info(f"Prompt: {prompt}")
1776 |     response = ... # Your implementation here
1777 |     logging.info(f"Response: {response}")
1778 |     return response
1779 | ```


--------------------------------------------------------------------------------