├── utils ├── __init__.py └── call_llm.py ├── requirements.txt ├── assets └── banner.png ├── flow.py ├── .gitignore ├── main.py ├── README.md ├── docs └── design.md ├── data_profiling_report.md ├── test └── patients.csv ├── nodes.py ├── .clinerules └── .cursorrules /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pocketflow>=0.0.1 2 | pandas>=2.0.0 3 | PyYAML>=6.0 4 | openai>=1.0.0 5 | google-genai -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-Pocket/PocketFlow-Tutorial-Data-Profiler/main/assets/banner.png -------------------------------------------------------------------------------- /utils/call_llm.py: -------------------------------------------------------------------------------- 1 | from google import genai 2 | import os 3 | 4 | def call_llm(prompt: str) -> str: 5 | """ 6 | Call Google Gemini LLM with the given prompt. 7 | 8 | Args: 9 | prompt (str): The prompt to send to the LLM 10 | 11 | Returns: 12 | str: The response from the LLM 13 | """ 14 | api_key = os.getenv("GEMINI_API_KEY", "Your API Key") 15 | client = genai.Client(api_key=api_key) 16 | model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro") 17 | 18 | response = client.models.generate_content( 19 | model=model, 20 | contents=[prompt] 21 | ) 22 | return response.text 23 | 24 | if __name__ == "__main__": 25 | test_prompt = "Hello, how are you?" 26 | 27 | print("Making call...") 28 | response = call_llm(test_prompt) 29 | print(f"Response: {response}") -------------------------------------------------------------------------------- /flow.py: -------------------------------------------------------------------------------- 1 | from pocketflow import Flow 2 | from nodes import ( 3 | DuplicateDetectionNode, 4 | TableSummaryNode, 5 | ColumnDescriptionNode, 6 | DataTypeAnalysisNode, 7 | MissingValuesAnalysisNode, 8 | UniquenessAnalysisNode, 9 | UnusualValuesDetectionNode, 10 | GenerateReportNode 11 | ) 12 | 13 | def create_data_profiling_flow(): 14 | """Create and return a data profiling flow.""" 15 | 16 | # Create all nodes 17 | duplicate_node = DuplicateDetectionNode() 18 | summary_node = TableSummaryNode() 19 | column_desc_node = ColumnDescriptionNode() 20 | data_type_node = DataTypeAnalysisNode() 21 | missing_values_node = MissingValuesAnalysisNode() 22 | uniqueness_node = UniquenessAnalysisNode() 23 | unusual_values_node = UnusualValuesDetectionNode() 24 | report_node = GenerateReportNode() 25 | 26 | # Connect nodes in sequence (following the workflow design) 27 | duplicate_node >> summary_node >> column_desc_node >> data_type_node >> missing_values_node >> uniqueness_node >> unusual_values_node >> report_node 28 | 29 | # Create flow starting with duplicate detection 30 | return Flow(start=duplicate_node) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | vendor/ 4 | .pnp/ 5 | .pnp.js 6 | 7 | # Build outputs 8 | dist/ 9 | build/ 10 | out/ 11 | *.pyc 12 | __pycache__/ 13 | 14 | # Environment files 15 | .env 16 | .env.local 17 | .env.*.local 18 | .env.development 19 | .env.test 20 | .env.production 21 | 22 | # IDE - VSCode 23 | .vscode/* 24 | !.vscode/settings.json 25 | !.vscode/tasks.json 26 | !.vscode/launch.json 27 | !.vscode/extensions.json 28 | 29 | # IDE - JetBrains 30 | .idea/ 31 | *.iml 32 | *.iws 33 | *.ipr 34 | 35 | # IDE - Eclipse 36 | .project 37 | .classpath 38 | .settings/ 39 | 40 | # Logs 41 | logs/ 42 | *.log 43 | npm-debug.log* 44 | yarn-debug.log* 45 | yarn-error.log* 46 | 47 | # Operating System 48 | .DS_Store 49 | Thumbs.db 50 | *.swp 51 | *.swo 52 | 53 | # Testing 54 | coverage/ 55 | .nyc_output/ 56 | 57 | # Temporary files 58 | *.tmp 59 | *.temp 60 | .cache/ 61 | 62 | # Compiled files 63 | *.com 64 | *.class 65 | *.dll 66 | *.exe 67 | *.o 68 | *.so 69 | 70 | # Package files 71 | *.7z 72 | *.dmg 73 | *.gz 74 | *.iso 75 | *.jar 76 | *.rar 77 | *.tar 78 | *.zip 79 | 80 | # Database 81 | *.sqlite 82 | *.sqlite3 83 | *.db 84 | 85 | # Optional npm cache directory 86 | .npm 87 | 88 | # Optional eslint cache 89 | .eslintcache 90 | 91 | # Optional REPL history 92 | .node_repl_history -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from flow import create_data_profiling_flow 3 | 4 | def main(): 5 | """Main function for data profiling""" 6 | 7 | # Load the test dataset 8 | print("Loading patient data...") 9 | df = pd.read_csv("test/patients.csv") 10 | print(f"Loaded {len(df)} rows and {len(df.columns)} columns") 11 | 12 | # Initialize shared store with the data profiling structure 13 | shared = { 14 | "dataframe": df, 15 | "sample_data": "", 16 | "profile_results": { 17 | "duplicates": {}, 18 | "table_summary": "", 19 | "column_descriptions": {}, 20 | "data_types": {}, 21 | "missing_values": {}, 22 | "uniqueness": {}, 23 | "unusual_values": {} 24 | }, 25 | "final_report": "" 26 | } 27 | 28 | # Create and run the data profiling flow 29 | print("\nStarting data profiling analysis...") 30 | profiling_flow = create_data_profiling_flow() 31 | profiling_flow.run(shared) 32 | 33 | # Save the report first (avoid console encoding issues) 34 | with open("data_profiling_report.md", "w", encoding="utf-8") as f: 35 | f.write(shared["final_report"]) 36 | print("\nReport saved to: data_profiling_report.md") 37 | print(f"Report contains {len(shared['final_report'])} characters") 38 | 39 | # Show basic stats instead of full report 40 | print("\n" + "="*50 + " SUMMARY " + "="*50) 41 | dup = shared["profile_results"]["duplicates"] 42 | print(f"✓ Analyzed {dup['total_rows']} rows, {len(shared['dataframe'].columns)} columns") 43 | print(f"✓ Found {dup['count']} duplicate rows ({dup['percentage']:.1f}%)") 44 | print(f"✓ Analysis complete - check data_profiling_report.md for full details") 45 | print("="*108) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PocketFlow Data Profiling Tool 2 | 3 | An intelligent data profiling tool powered by LLMs that provides deep, contextual analysis of your datasets beyond traditional statistical metrics. 4 | 5 | ## 🎯 What This Tool Does 6 | 7 | This tool performs comprehensive data profiling through a 7-step workflow: 8 | 9 | 1. **Duplicate Detection** - Identifies and analyzes duplicate rows with recommendations 10 | 2. **Table Summary** - Generates high-level description of what your data represents 11 | 3. **Column Descriptions** - Analyzes each column with meaningful descriptions and naming suggestions 12 | 4. **Data Type Analysis** - Recommends optimal data types for each column 13 | 5. **Missing Values Analysis** - Categorizes missing values as meaningful vs problematic 14 | 6. **Uniqueness Analysis** - Identifies potential unique identifier columns 15 | 7. **Unusual Values Detection** - Detects outliers, anomalies, and data quality issues 16 | 17 | ## 🚀 How to Run 18 | 19 | ### Prerequisites 20 | 21 | 1. **Install dependencies:** 22 | ```bash 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | 2. **Set up your LLM:** 27 | 28 | The tool uses OpenAI by default. Set your API key: 29 | ```bash 30 | export OPENAI_API_KEY="your-key-here" 31 | ``` 32 | 33 | To use your own LLM or different providers, check out the [PocketFlow LLM documentation](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) and modify `utils/call_llm.py` accordingly. 34 | 35 | **Test your LLM setup:** 36 | ```bash 37 | python utils/call_llm.py 38 | ``` 39 | 40 | ### Running the Tool 41 | 42 | ```bash 43 | python main.py 44 | ``` 45 | 46 | By default, it analyzes the sample patient dataset in `test/patients.csv`. To analyze your own data, modify `main.py`: 47 | 48 | ```python 49 | # Replace this line: 50 | df = pd.read_csv("test/patients.csv") 51 | 52 | # With your data: 53 | df = pd.read_csv("path/to/your/data.csv") 54 | ``` 55 | 56 | ### Output 57 | 58 | The tool generates: 59 | - **Console summary** with key statistics 60 | - **Markdown report** saved as `data_profiling_report.md` with comprehensive analysis 61 | 62 | ## 📊 Example Results 63 | 64 | From the sample patient dataset (60 rows, 27 columns): 65 | 66 | - ✅ Detected invalid SSN formats (test data with "999" prefix) 67 | - ✅ Identified name contamination (numeric suffixes in names) 68 | - ✅ Found meaningful missing patterns (83% missing death dates = living patients) 69 | - ✅ Recommended data type conversions (dates to datetime64, categories for demographics) 70 | - ✅ Identified unique identifiers (UUID primary key, SSN) 71 | 72 | ## 🏗️ Architecture 73 | 74 | Built with [PocketFlow](https://github.com/The-Pocket/PocketFlow) - a minimalist LLM framework: 75 | 76 | - **Workflow pattern** for sequential processing pipeline 77 | - **BatchNode** for efficient parallel column analysis 78 | - **YAML-based** structured outputs with validation 79 | - **Intelligent LLM analysis** for contextual understanding 80 | 81 | ## 📁 Project Structure 82 | 83 | ``` 84 | ├── main.py # Entry point 85 | ├── flow.py # Flow orchestrator 86 | ├── nodes.py # All profiling nodes 87 | ├── utils/ 88 | │ └── call_llm.py # LLM utility (customize for your provider) 89 | ├── test/ 90 | │ └── patients.csv # Sample dataset 91 | └── docs/ 92 | └── design.md # Design documentation 93 | ``` 94 | 95 | ## 🔧 Customization 96 | 97 | ### Using Different LLM Providers 98 | 99 | Edit `utils/call_llm.py` to use your preferred LLM: 100 | - Claude (Anthropic) 101 | - Google Gemini 102 | - Azure OpenAI 103 | - Local models (Ollama) 104 | 105 | See the [PocketFlow LLM guide](https://the-pocket.github.io/PocketFlow/utility_function/llm.html) for examples. 106 | 107 | ### Analyzing Different Data Types 108 | 109 | The tool works with any pandas DataFrame. You can: 110 | - Load from CSV, Excel, JSON, Parquet 111 | - Connect to databases 112 | - Use API data 113 | 114 | Just ensure your data is loaded as a pandas DataFrame before running the flow. 115 | 116 | ## 🎓 Tutorial 117 | 118 | This project demonstrates **Agentic Coding** with [PocketFlow](https://github.com/The-Pocket/PocketFlow). Want to learn more? 119 | 120 | - Check out the [Agentic Coding Guidance](https://the-pocket.github.io/PocketFlow/guide.html) 121 | - Watch the [YouTube Tutorial](https://www.youtube.com/@ZacharyLLM?sub_confirmation=1) 122 | 123 | ## 📝 License 124 | 125 | This project is a tutorial example for PocketFlow. 126 | -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | # Design Doc: Data Profiling Tool 2 | 3 | > Please DON'T remove notes for AI 4 | 5 | ## Requirements 6 | 7 | > Notes for AI: Keep it simple and clear. 8 | > If the requirements are abstract, write concrete user stories 9 | 10 | **Problem**: Users need to understand their pandas DataFrame data quality and characteristics before analysis or modeling. 11 | 12 | **User Stories**: 13 | - As a data scientist, I want to automatically detect duplicate rows so I can decide whether to remove them 14 | - As an analyst, I want a high-level summary of my table to understand what the data represents 15 | - As a data engineer, I want detailed column descriptions to understand each field's meaning 16 | - As a developer, I want to identify correct data types for proper processing 17 | - As a researcher, I want to find missing values and understand if they're meaningful or problematic 18 | - As a quality analyst, I want to identify unique columns that could serve as identifiers 19 | - As a data validator, I want to detect unusual/outlier values that may indicate data quality issues 20 | 21 | ## Flow Design 22 | 23 | > Notes for AI: 24 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit. 25 | > 2. Present a concise, high-level description of the workflow. 26 | 27 | ### Applicable Design Pattern: 28 | 29 | 1. **Workflow**: Sequential processing pipeline where each step builds upon previous analysis 30 | 2. **Batch**: Some nodes (like column analysis) process multiple columns in parallel for efficiency 31 | 32 | ### Flow High-level Design: 33 | 34 | 1. **Duplicate Detection Node**: Analyzes the DataFrame for duplicate rows and provides statistics 35 | 2. **Table Summary Node**: Creates a high-level description of what the table represents 36 | 3. **Column Description Node**: Analyzes each column to provide meaningful descriptions and suggest better names 37 | 4. **Data Type Analysis Node**: Determines appropriate data types for each column 38 | 5. **Missing Values Analysis Node**: Identifies missing values and categorizes them as meaningful vs problematic 39 | 6. **Uniqueness Analysis Node**: Identifies columns that could serve as unique identifiers 40 | 7. **Unusual Values Detection Node**: Detects outliers and anomalous values in each column 41 | 42 | ```mermaid 43 | flowchart TD 44 | start[Start: Load DataFrame] --> duplicate[Duplicate Detection] 45 | duplicate --> summary[Table Summary] 46 | summary --> columns[Column Descriptions] 47 | columns --> datatypes[Data Type Analysis] 48 | datatypes --> missing[Missing Values Analysis] 49 | missing --> unique[Uniqueness Analysis] 50 | unique --> unusual[Unusual Values Detection] 51 | unusual --> report[Generate Final Report] 52 | ``` 53 | 54 | ## Utility Functions 55 | 56 | > Notes for AI: 57 | > 1. Understand the utility function definition thoroughly by reviewing the doc. 58 | > 2. Include only the necessary utility functions, based on nodes in the flow. 59 | 60 | 1. **Call LLM** (`utils/call_llm.py`) 61 | - *Input*: prompt (str) 62 | - *Output*: response (str) 63 | - Used by all analysis nodes for intelligent data interpretation 64 | 65 | ## Node Design 66 | 67 | ### Shared Store 68 | 69 | > Notes for AI: Try to minimize data redundancy 70 | 71 | The shared store structure is organized as follows: 72 | 73 | ```python 74 | shared = { 75 | "dataframe": pd.DataFrame, # Original DataFrame 76 | "sample_data": str, # CSV sample for LLM analysis 77 | "profile_results": { 78 | "duplicates": { 79 | "count": int, 80 | "percentage": float, 81 | "sample_rows": str 82 | }, 83 | "table_summary": str, 84 | "column_descriptions": { 85 | "col_name": { 86 | "description": str, 87 | "suggested_name": str 88 | } 89 | }, 90 | "data_types": { 91 | "col_name": { 92 | "current_type": str, 93 | "suggested_type": str, 94 | "confidence": float 95 | } 96 | }, 97 | "missing_values": { 98 | "col_name": { 99 | "count": int, 100 | "percentage": float, 101 | "likely_meaningful": bool, 102 | "reason": str 103 | } 104 | }, 105 | "uniqueness": { 106 | "col_name": { 107 | "unique_count": int, 108 | "unique_percentage": float, 109 | "is_candidate_key": bool 110 | } 111 | }, 112 | "unusual_values": { 113 | "col_name": { 114 | "has_unusual": bool, 115 | "unusual_samples": list, 116 | "explanation": str 117 | } 118 | } 119 | }, 120 | "final_report": str # Comprehensive profiling report 121 | } 122 | ``` 123 | 124 | ### Node Steps 125 | 126 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow. 127 | 128 | 1. **Duplicate Detection Node** 129 | - *Purpose*: Detect and analyze duplicate rows in the DataFrame 130 | - *Type*: Regular Node 131 | - *Steps*: 132 | - *prep*: Read "dataframe" from shared store and create sample 133 | - *exec*: Call LLM to analyze duplicate patterns and significance 134 | - *post*: Write duplicate analysis to "profile_results.duplicates" 135 | 136 | 2. **Table Summary Node** 137 | - *Purpose*: Generate high-level description of the table's purpose and content 138 | - *Type*: Regular Node 139 | - *Steps*: 140 | - *prep*: Read "dataframe" sample and column names from shared store 141 | - *exec*: Call LLM to generate comprehensive table summary 142 | - *post*: Write summary to "profile_results.table_summary" 143 | 144 | 3. **Column Description Node** 145 | - *Purpose*: Analyze each column to provide descriptions and name suggestions 146 | - *Type*: Batch Node (processes columns in chunks) 147 | - *Steps*: 148 | - *prep*: Return list of column chunks for parallel processing 149 | - *exec*: Call LLM to analyze each column chunk for descriptions 150 | - *post*: Combine results and write to "profile_results.column_descriptions" 151 | 152 | 4. **Data Type Analysis Node** 153 | - *Purpose*: Determine appropriate data types for each column 154 | - *Type*: Regular Node 155 | - *Steps*: 156 | - *prep*: Read "dataframe" and column info from shared store 157 | - *exec*: Call LLM to analyze data types with sample data 158 | - *post*: Write type analysis to "profile_results.data_types" 159 | 160 | 5. **Missing Values Analysis Node** 161 | - *Purpose*: Analyze missing values to determine if they're meaningful or problematic 162 | - *Type*: Regular Node 163 | - *Steps*: 164 | - *prep*: Read "dataframe" and calculate missing value statistics 165 | - *exec*: Call LLM to determine if missing values are meaningful 166 | - *post*: Write missing value analysis to "profile_results.missing_values" 167 | 168 | 6. **Uniqueness Analysis Node** 169 | - *Purpose*: Identify columns that could serve as unique identifiers 170 | - *Type*: Regular Node 171 | - *Steps*: 172 | - *prep*: Read "dataframe" and calculate uniqueness statistics 173 | - *exec*: Call LLM to determine candidate key columns 174 | - *post*: Write uniqueness analysis to "profile_results.uniqueness" 175 | 176 | 7. **Unusual Values Detection Node** 177 | - *Purpose*: Detect outliers and anomalous values in columns 178 | - *Type*: Batch Node (processes columns individually) 179 | - *Steps*: 180 | - *prep*: Return list of columns to analyze for unusual values 181 | - *exec*: Call LLM to analyze each column's value patterns 182 | - *post*: Write unusual value findings to "profile_results.unusual_values" 183 | 184 | -------------------------------------------------------------------------------- /data_profiling_report.md: -------------------------------------------------------------------------------- 1 | # Data Profiling Report 2 | 3 | ## Table Summary 4 | This table represents a collection of detailed personal records for individuals. Each person is identified by an **Id**, and may also have an **SSN**, **DRIVERS** license, or **PASSPORT** number. Their full name is detailed with **PREFIX**, **FIRST**, **LAST**, **SUFFIX**, and a **MAIDEN** name if applicable. 5 | 6 | The records include vital and demographic information such as **BIRTHDATE**, **DEATHDATE**, **MARITAL** status, **RACE**, **ETHNICITY**, and **GENDER**. Geographic information specifies the person's **BIRTHPLACE** and their current residential **ADDRESS**, **CITY**, **STATE**, **COUNTY**, **FIPS** code, **ZIP**, and geographic coordinates (**LAT**, **LON**). Finally, the table contains financial information related to an individual's **HEALTHCARE_EXPENSES**, **HEALTHCARE_COVERAGE**, and **INCOME**. 7 | 8 | ## Duplicate Analysis 9 | - **Total rows**: 60 10 | - **Duplicate rows**: 0 (0.00%) 11 | - **Should remove**: False 12 | - **Analysis**: No duplicate rows found in the dataset. 13 | 14 | ## Column Descriptions 15 | - **Id** → *person_id*: A unique identifier for each record, formatted as a UUID (Universally Unique Identifier). 16 | - **BIRTHDATE** → *birth_date*: The person's date of birth in YYYY-MM-DD format. 17 | - **DEATHDATE** → *death_date*: The person's date of death in YYYY-MM-DD format. This field is empty if the person is alive. 18 | - **SSN** → *social_security_number*: The person's 9-digit Social Security Number, formatted as XXX-XX-XXXX. 19 | - **DRIVERS** → *drivers_license_number*: The person's driver's license number. 20 | - **PASSPORT** → *passport_number*: The person's passport number. 21 | - **PREFIX** → *name_prefix*: A title or honorific that precedes a person's name (e.g., 'Mr.', 'Mrs.', 'Dr.'). 22 | - **FIRST** → *first_name*: The person's first or given name. 23 | - **LAST** → *last_name*: The person's last or family name. 24 | - **SUFFIX** → *name_suffix*: A suffix that follows a person's full name (e.g., 'Jr.', 'Sr.', 'III'). 25 | - **MAIDEN** → *maiden_name*: The individual's last name at birth, often used for married individuals who have changed their name. Appears to have null values for those it does not apply to. 26 | - **MARITAL** → *marital_status*: The individual's marital status. The sample data uses 'M' likely for 'Married'. 27 | - **RACE** → *race*: The individual's self-identified race. 28 | - **ETHNICITY** → *ethnicity*: The individual's self-identified ethnicity, primarily indicating Hispanic or Non-Hispanic origin. 29 | - **GENDER** → *gender*: The individual's gender, represented by 'M' for Male and 'F' for Female. 30 | - **BIRTHPLACE** → *birth_place*: The location where the individual was born, as a single string containing city, state, and country. 31 | - **ADDRESS** → *street_address*: The street address of the individual's residence, including building number, street name, and unit/apartment number. 32 | - **CITY** → *city*: The city of the individual's residential address. 33 | - **STATE** → *state*: The state of the individual's residential address. 34 | - **COUNTY** → *county*: The county of the individual's residential address. 35 | - **FIPS** → *fips_code*: A FIPS (Federal Information Processing Standard) code, likely identifying a US county. 36 | - **ZIP** → *zip_code*: The 5-digit US postal ZIP code for the location. 37 | - **LAT** → *latitude*: The geographic latitude coordinate for the location. 38 | - **LON** → *longitude*: The geographic longitude coordinate for the location. 39 | - **HEALTHCARE_EXPENSES** → *healthcare_expenses_usd*: A monetary value representing healthcare-related expenses, likely per capita or household, in USD. 40 | - **HEALTHCARE_COVERAGE** → *healthcare_coverage_value_usd*: A monetary value related to healthcare coverage, possibly representing total premiums or insured value in the area. 41 | - **INCOME** → *median_income_usd*: A monetary value representing the average or median income for the area, likely in USD. 42 | 43 | ## Data Type Analysis 44 | - **BIRTHDATE**: object → *datetime64* (The column contains date values in a standard 'YYYY-MM-DD' format.) 45 | - **DEATHDATE**: object → *datetime64* (The column contains date values and empty strings, which can be represented as dates and Not a Time (NaT) values.) 46 | - **PREFIX**: object → *category* (The column has a small number of repeated string values (e.g., 'Mr.', 'Mrs.', 'Ms.'), making it ideal for the memory-efficient category type.) 47 | - **SUFFIX**: object → *category* (This column likely contains a small, fixed set of name suffixes (e.g. 'Jr.', 'Sr.'), making it suitable for the category type.) 48 | - **MARITAL**: object → *category* (The column represents marital status and likely has a small number of distinct values ('M', 'S', etc.), making it ideal for the category type.) 49 | - **RACE**: object → *category* (The column contains a small, well-defined set of values for race, which is a classic categorical variable.) 50 | - **ETHNICITY**: object → *category* (The column contains a small, well-defined set of values for ethnicity, making it a categorical variable.) 51 | - **GENDER**: object → *category* (The column has a very small number of distinct values ('M', 'F'), making it a prime candidate for the category type.) 52 | - **CITY**: object → *category* (The number of unique city names is much smaller than the total number of records, making 'category' a memory-efficient choice.) 53 | - **STATE**: object → *category* (The number of unique states is very small and fixed, making this an ideal categorical variable.) 54 | - **COUNTY**: object → *category* (The number of unique counties is finite and much smaller than the number of records, making 'category' a memory-efficient choice.) 55 | - **FIPS**: float64 → *category* (FIPS codes are categorical identifiers for geographic locations. Using 'category' is memory efficient and semantically correct as they are not used for mathematical operations.) 56 | - **ZIP**: int64 → *category* (ZIP codes are geographic identifiers. While numeric, they are not used for calculations. Using 'category' is memory-efficient and avoids issues with leading zeros.) 57 | 58 | ## Missing Values Analysis 59 | **Overview**: The dataset exhibits both meaningful and problematic missingness. Fields like DEATHDATE, SUFFIX, and MAIDEN have high percentages of missing values that are expected and informative, indicating a specific status (e.g., 'alive' or 'not applicable'). Conversely, fields like MARITAL and FIPS have missing values that represent genuine data quality gaps, hindering demographic and geographic analysis. 60 | 61 | ### Problematic Missing Values 62 | - **PREFIX**: 10 missing (16.7%) - Prefixes (Mr., Ms., etc.) are often optional fields. While their absence is common, it represents incomplete data rather than a specific status, making it a minor data quality issue. 63 | - **MARITAL**: 20 missing (33.3%) - Marital status is a core demographic attribute. A 33.3% missing rate is a significant data quality problem, as the absence does not imply a default status (like 'single') and creates gaps in analysis. 64 | - **FIPS**: 14 missing (23.3%) - FIPS is a standardized geographic code for a county. Since COUNTY data exists, the FIPS code should be derivable. Its absence is a data processing or quality issue that hinders standardized geographic analysis. 65 | 66 | ### Likely Meaningful Missing Values 67 | - **DEATHDATE**: 50 missing (83.3%) - The high percentage of missing values (83.3%) strongly suggests that a blank DEATHDATE indicates the person is still alive. The absence of data is the data. 68 | - **DRIVERS**: 6 missing (10.0%) - A missing driver's license number likely means the person does not have one, which could be due to age (minors) or personal choice. It is not necessarily an error. 69 | - **PASSPORT**: 13 missing (21.7%) - Similar to a driver's license, not every individual has a passport. A missing value indicates the person likely does not possess one. 70 | - **SUFFIX**: 56 missing (93.3%) - Name suffixes (Jr., III, etc.) are rare. The very high percentage of missing values (93.3%) correctly reflects that most people do not have one. 71 | - **MAIDEN**: 49 missing (81.7%) - A maiden name is only applicable to a subset of the population (typically, married individuals who changed their name). A blank value is expected for males, unmarried individuals, or those who kept their original name. 72 | 73 | ## Uniqueness Analysis 74 | ### Candidate Key Columns 75 | - **Id**: This column is a system-generated unique identifier (like a UUID) for each record. The table context states it identifies each person, and the data analysis confirms it is 100% unique. It's designed specifically to be a primary key. 76 | - **SSN**: A Social Security Number is a government-issued number intended to be a unique identifier for each person in the United States. It is 100% unique in the sample data and is a strong candidate for a natural key, despite its sensitive nature. 77 | 78 | ### Highly Unique Columns 79 | - **BIRTHDATE**: 83.3% unique 80 | - **DRIVERS**: 90.0% unique 81 | - **PASSPORT**: 78.3% unique 82 | - **FIRST**: 98.3% unique 83 | - **LAST**: 88.3% unique 84 | - **BIRTHPLACE**: 80.0% unique 85 | - **ADDRESS**: 100.0% unique 86 | - **CITY**: 66.7% unique 87 | - **ZIP**: 60.0% unique 88 | - **LAT**: 100.0% unique 89 | - **LON**: 100.0% unique 90 | - **HEALTHCARE_EXPENSES**: 100.0% unique 91 | - **HEALTHCARE_COVERAGE**: 98.3% unique 92 | - **INCOME**: 83.3% unique 93 | 94 | ## Unusual Values Detection 95 | - **SSN**: All sample values begin with the area number '999'. The Social Security Administration (SSA) does not issue SSNs with area numbers (the first three digits) in the 900-999 range. These values are invalid and likely represent dummy or placeholder data. 96 | - **FIRST**: The column 'FIRST' is expected to contain first names. However, all sample values are a mix of text and numbers (e.g., 'Mel236', 'Cheyenne169'). This suggests that names have been concatenated with a numeric ID or code, which is unusual for a standard first name field. 97 | - **LAST**: The values in the 'LAST' column consistently follow a pattern of a name followed by a three-digit number (e.g., 'Bailey598'). This is unusual because a column named 'LAST' is expected to contain only the last name. The presence of appended numbers suggests a potential data quality issue where a name and a numeric ID have been merged into a single field. 98 | - **MAIDEN**: The values in the 'MAIDEN' column consistently follow a pattern of a name followed by a sequence of numbers (e.g., 'Lowe577'). A column representing a maiden name would typically contain only alphabetic characters. The presence of appended numbers is unusual and suggests the column may be a concatenation of a name and a numeric identifier. 99 | - **FIPS**: The values appear to be valid 5-digit county FIPS codes. However, they are stored as floats (float64) instead of strings. FIPS codes are identifiers, not numerical quantities, and should be stored as strings to prevent issues like the loss of leading zeros (e.g., '01001' becoming 1001.0) and to reflect their categorical nature. The trailing '.0' in each sample is an artifact of this incorrect data type. 100 | - **ZIP**: The value '0' is present, which is not a valid ZIP or postal code. This value likely represents missing data, a default entry, or an error during data conversion. 101 | - **HEALTHCARE_COVERAGE**: The presence of `0.0` is unusual. It's ambiguous whether this represents a valid state (no coverage) or is a placeholder for missing data. Additionally, the data has a very wide range, and the value `1777031.06` is a potential high-end outlier, being significantly larger than the other sample values. 102 | -------------------------------------------------------------------------------- /test/patients.csv: -------------------------------------------------------------------------------- 1 | Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME 2 | eb247227-e839-88d3-447d-b5972468f33b,2021-09-23,,999-41-1756,,,,Mel236,Bailey598,,,,white,nonhispanic,M,Norton Center Massachusetts US,716 Wunsch Gardens Unit 48,Framingham,Massachusetts,Middlesex County,25017,01701,42.27565048847629,-71.4763670033942,2520.80,4323.64,170754 3 | 2ffa361e-5858-877e-e022-ce81fe32da1b,1944-05-31,,999-33-4589,S99957814,X45639058X,Mrs.,Cheyenne169,Marks830,,Lowe577,M,white,nonhispanic,F,Longmeadow Massachusetts US,123 Bayer Camp,Taunton,Massachusetts,Bristol County,25005,02718,41.89288420730215,-71.06668598167076,205342.20,94647.00,40526 4 | 3dfb065a-67df-5b8a-3901-49bfd834bed1,2009-02-08,,999-59-2568,,,,Hunter736,Keebler762,,,,white,nonhispanic,M,Maynard Massachusetts US,575 Jast Rue Unit 48,Winchendon,Massachusetts,Worcester County,25027,01475,42.670059014687666,-72.07466425723803,16381.92,17447.87,79884 5 | db80575b-5e9b-921b-fad9-1e3a20929dc7,1979-06-26,1995-07-04,999-77-7700,S99968506,,,Herschel574,Ernser583,,,,asian,nonhispanic,M,Somerville Massachusetts US,184 Langworth Parade Apt 10,Boston,Massachusetts,Suffolk County,25025,02131,42.39551626795498,-71.05901494925675,3850.00,44057.32,6420 6 | d84815a3-c5b3-8ca2-025f-6323a4ec59ef,1973-05-31,,999-29-2359,S99967405,X86891718X,Mrs.,Lacey714,Heathcote539,,Hegmann834,M,white,nonhispanic,F,Natick Massachusetts US,801 Morissette Divide,Hingham,Massachusetts,Plymouth County,25023,02043,42.20072325055452,-70.83659045847199,66662.10,1777031.06,933420 7 | 7ec76836-c039-d9bf-8bb9-fe488c66d452,2003-01-13,,999-42-9847,S99998925,,Ms.,Adelia946,Collier206,,,,white,nonhispanic,F,Marshfield Massachusetts US,459 Larson Union,Boston,Massachusetts,Suffolk County,25025,02134,42.31550631209828,-71.05169551644717,4050.00,158604.59,1361 8 | 79297a39-2d2d-d88d-5e47-7a521af1d69f,1998-12-14,,999-49-9846,S99945605,X69843358X,Mr.,Hayden835,Casper496,,,,white,nonhispanic,M,Charlton Massachusetts US,589 Conroy Approach,Belmont,Massachusetts,Middlesex County,25017,02472,42.36153882121102,-71.20913616208074,40347.11,180076.32,51861 9 | 734e5f3c-e660-6cbe-7c26-c5264cbde68e,2005-03-03,,999-71-8314,S99991875,,,Herb645,Willms744,,,,white,hispanic,M,Melrose Massachusetts US,980 Koss Plaza Apt 11,Brockton,Massachusetts,Plymouth County,25023,02302,42.11165412918098,-71.0259065985567,390568.25,0.00,35002 10 | 750cdaf4-c264-e967-e76b-53a5a61abcab,1983-02-18,,999-95-3792,S99957390,X63804957X,Mr.,Stewart672,Schimmel440,,,M,white,nonhispanic,M,Fall River Massachusetts US,843 Yost Spur Unit 81,Sharon,Massachusetts,Norfolk County,25021,02067,42.10147261542774,-71.2054748347118,7321.10,176776.70,10335 11 | 285cba54-c91d-6db4-4d78-1ea35ba6b622,1998-10-30,,999-44-2795,S99942670,X49037240X,Ms.,Jenae263,Becker968,,,,white,nonhispanic,F,Somerville Massachusetts US,248 Ernser Terrace Suite 86,Lynn,Massachusetts,Essex County,25009,01901,42.49850442782566,-71.03582388708702,34690.56,626729.36,56421 12 | 064ef124-22ef-af09-1940-0fec6c3574bc,1972-05-01,,999-29-7349,S99945886,X86223344X,Ms.,Andera917,Lemke654,,,S,white,nonhispanic,F,Kingston Massachusetts US,606 Price View Unit 89,Boston,Massachusetts,Suffolk County,25025,02116,42.428281838517776,-71.03071500996144,591573.83,346350.22,28232 13 | df6bcea7-a0c7-6ed0-e9e4-fd1dc33b76f7,1965-08-03,,999-38-7473,S99953338,X64982272X,Mrs.,Linn541,Gislason620,,Hermann103,M,white,nonhispanic,F,Westwood Massachusetts US,275 Tromp Burg Suite 54,Erving,Massachusetts,Franklin County,,00000,42.6366197794778,-72.38328626727915,54801.69,228252.18,83088 14 | cfa94700-7440-d5f7-516a-bae08cb365a7,2022-08-14,,999-20-5403,,,,Kaycee352,Koss676,,,,white,nonhispanic,F,New Bedford Massachusetts US,859 Hansen Mission Apt 56,Montague,Massachusetts,Franklin County,,00000,42.590026050563246,-72.52614825581455,1644.00,1310.64,147152 15 | 52f8df2b-25a8-fbba-af75-0e11f3a054d4,2000-10-21,,999-16-4297,S99952319,X27281996X,Ms.,Cindy893,Lueilwitz711,,,,asian,nonhispanic,F,Hanoi Hà Đông VN,892 Haag Gateway Unit 67,Boston,Massachusetts,Suffolk County,25025,02120,42.31347958243134,-71.1029639299087,8065.00,917055.49,3758 16 | bc1efffb-0983-081f-d4c4-3345f6f2abbd,2009-05-16,,999-43-4282,,,,Huey641,Schumm995,,,,black,nonhispanic,M,Needham Massachusetts US,835 Powlowski Junction Suite 1,Danvers,Massachusetts,Essex County,25009,01923,42.60647530229309,-71.02623584083403,13076.83,5093.56,74906 17 | b3b71304-fe5b-bda4-6822-bd901b2836d1,1962-05-14,,999-48-5926,S99948203,X32413718X,Mr.,Antony83,Armstrong51,,,M,white,nonhispanic,M,Fall River Massachusetts US,830 Dare Park Apt 34,Marshfield,Massachusetts,Plymouth County,25023,02050,42.162629807392534,-70.73981579334742,41148.04,632431.62,35673 18 | fd0b726d-b7e6-976d-7cda-8679dd849610,1965-01-01,,999-96-6743,S99949277,X14857165X,Ms.,Daniela614,Rico947,,,S,white,hispanic,F,Bayamon Puerto Rico PR,279 Grady Estate,Boston,Massachusetts,Suffolk County,25025,02120,42.32851940354027,-71.03026624879821,54097.09,478630.93,93689 19 | 53534989-404e-cc7c-2859-1708edba296c,1959-05-28,,999-14-9672,S99998623,X86168167X,Mr.,Fletcher87,O'Conner199,,,S,white,nonhispanic,M,Wellesley Massachusetts US,873 Ledner Hollow Unit 28,Gardner,Massachusetts,Worcester County,25027,01440,42.56780989156895,-72.00057697137188,329359.12,160957.56,40192 20 | fca2a21e-3319-131a-7e84-ff984b871e16,1979-06-26,,999-29-4844,S99938158,X13579933X,Mr.,Kirk871,Nolan344,,,S,asian,nonhispanic,M,Billerica Massachusetts US,356 Wintheiser Passage,Boston,Massachusetts,Suffolk County,25025,02109,42.382825730736656,-71.060338397059,12300.34,775237.82,6420 21 | d89557ad-d741-8ea5-b542-c1226a781d83,1963-12-09,,999-22-2635,S99975865,X35559645X,Mr.,Steve819,Brakus656,,,M,white,nonhispanic,M,Quincy Massachusetts US,949 Langworth Light Apt 7,Yarmouth,Massachusetts,Barnstable County,,00000,41.67994080784203,-70.22659782831724,517636.61,2883.47,51527 22 | c8fbb10b-b54e-8182-d71c-c552bd1c58b1,1976-04-03,,999-50-5697,S99983337,X69114774X,Mr.,Hayden835,Schumm995,,,S,white,nonhispanic,M,Braintree Massachusetts US,856 Gusikowski Lane,North Adams,Massachusetts,Berkshire County,25003,01247,42.63685002791058,-73.08831197031574,190214.57,27591.19,36603 23 | 1d13ebc3-0635-059a-5fe9-82c92ede84ec,2006-05-08,,999-79-3695,S99923330,,,Moises22,O'Conner199,,,,white,hispanic,M,Saugus Massachusetts US,459 Cassin Forge Suite 9,Baldwinville,Massachusetts,Worcester County,25027,01436,42.603196202455976,-72.08703841216905,20035.70,11871.97,69174 24 | 0cd0df97-9d92-5d95-fbde-6b0a7e6af1c8,1932-07-09,1941-07-07,999-55-6098,,,,Coleman27,Kreiger457,,,,white,nonhispanic,M,Boston Massachusetts US,785 Ankunding Drive,Scituate,Massachusetts,Plymouth County,25023,02066,42.18599323192734,-70.79923081726402,9206.41,38661.38,870606 25 | fa451eba-6815-0d99-fd91-02f5581d914b,1946-11-12,,999-79-2426,S99940159,X53572828X,Ms.,Lillia547,Nolan344,,,S,white,nonhispanic,F,Chelsea Massachusetts US,287 Medhurst Bypass,Saugus,Massachusetts,Essex County,25009,01906,42.42536923970955,-71.00203089714843,64688.84,1193878.90,99091 26 | 92bc26b1-c317-7db6-492e-3c8ea452b36d,2005-04-22,,999-57-7157,S99990370,,,Vanesa40,Anderson154,,,,white,nonhispanic,F,Lawrence Massachusetts US,990 Hyatt Gateway,Chicopee,Massachusetts,Hampden County,25013,01013,42.20443967658899,-72.59684936418238,23076.68,14653.34,27706 27 | 8cfec0eb-f022-f332-55f5-38a2c35f5b84,2002-08-14,,999-93-7263,S99928764,X60540972X,Ms.,Mirta419,Hayes766,,,,white,nonhispanic,F,Boston Massachusetts US,737 Hauck Estate,Holliston,Massachusetts,Middlesex County,,00000,42.19537880557404,-71.40405413382206,11458.37,763952.25,17389 28 | 27948426-0f88-0e3b-dd6b-8bd9d8512892,1991-04-23,,999-99-2106,S99915210,X21909602X,Ms.,Shiela18,Jenkins714,,,S,white,nonhispanic,F,Methuen Massachusetts US,931 Lowe Route,Boston,Massachusetts,Suffolk County,25025,02111,42.366094173491376,-71.0766434184029,37489.68,507808.67,172784 29 | 548300bb-3152-531c-d895-d44fbf2ff1ba,1972-10-14,,999-17-2611,S99989489,X58425716X,Mrs.,Christal240,Hoppe518,,Gorczany269,M,white,nonhispanic,F,Boston Massachusetts US,298 Ryan Corner Suite 66,Sandwich,Massachusetts,Barnstable County,25001,02563,41.710640404859575,-70.46063787198887,53359.37,719097.64,91140 30 | 821960e1-9db8-7b56-a359-ac34d9228fbc,1960-04-26,,999-61-6611,S99976250,X16809119X,Mrs.,Guadalupe206,Bermúdez789,,Barela183,M,white,hispanic,F,Ponce Puerto Rico PR,336 Nienow Course,Tyngsborough,Massachusetts,Middlesex County,,00000,42.66681617186233,-71.47460669911267,67147.48,794967.24,64210 31 | 100881d9-7b59-8060-2772-f41b276970fc,1985-01-06,,999-89-9127,S99998550,X39158329X,Mrs.,Corey514,Johnson679,,Beier427,M,white,nonhispanic,F,Norton Massachusetts US,412 Spinka Plaza,Quincy,Massachusetts,Norfolk County,25021,02171,42.2627318928807,-71.01242779795183,41414.26,543793.88,129073 32 | d4db8ae1-3354-d064-508d-834cfa214cb2,1958-07-12,,999-35-8448,S99991458,X15138167X,Mr.,Gayle448,MacGyver246,,,M,white,nonhispanic,M,Framingham Massachusetts US,138 Hilll Well,Yarmouth,Massachusetts,Barnstable County,,00000,41.679741071089055,-70.17797861520093,79882.99,32342.26,21813 33 | d7f0610d-ec4d-fb9e-2d0d-b54e3ad621fb,1978-12-22,,999-88-1792,S99963549,X35369763X,Mr.,Whitney250,Hamill307,,,M,white,hispanic,M,Walpole Massachusetts US,416 Oberbrunner Dam Apt 95,Worcester,Massachusetts,Worcester County,25027,01603,42.29434089325724,-71.80056312559488,36814.41,239.57,44580 34 | e8323f7c-6829-3bce-0621-63d588b2e901,2010-09-21,,999-32-4862,,,,Dione665,Wilkinson796,,,,white,nonhispanic,F,Grafton Massachusetts US,455 Rutherford Lock,Lawrence,Massachusetts,Essex County,25009,01841,42.65392733597755,-71.1397130605323,13474.46,27265.47,64612 35 | 2995bf9b-5760-2099-77fe-ba01250cec42,1953-05-07,,999-10-1178,S99915523,X21910865X,Ms.,Shayla126,Rath779,,,S,white,nonhispanic,F,Fitchburg Massachusetts US,106 Hane Skyway Suite 0,Hampden,Massachusetts,Hampden County,,00000,42.07986301784488,-72.43670689724918,16242.76,800689.76,15098 36 | b4ea2bfe-cd6b-92a6-ff78-d2e995243894,1932-07-09,1966-11-11,999-70-2405,S99928700,X87718021X,Mr.,Damon455,Kshlerin58,JD,,S,white,nonhispanic,M,Cambridge Massachusetts US,803 Powlowski Park,Scituate,Massachusetts,Plymouth County,25023,02066,42.16374680818888,-70.80708512636961,23445.48,18839.51,870606 37 | ee98453d-79ed-910b-2e4a-9b32d9350fb6,1991-03-13,,999-70-4594,S99922993,X63601019X,Mr.,Luke971,Trantow673,,,M,asian,nonhispanic,M,Haiphong Kiến An VN,653 Jones Run Suite 14,Leominster,Massachusetts,Worcester County,25027,01453,42.55158257952831,-71.7700198666129,24117.54,26060.37,139930 38 | 98308074-8188-7b69-a1d1-be735cdc3ff4,1997-08-01,,999-14-9380,S99982480,X30896463X,Ms.,Carrol931,Rutherford999,,,,white,nonhispanic,F,Malden Massachusetts US,401 Reichel Route Suite 47,Pittsfield,Massachusetts,Berkshire County,25003,01201,42.414512991574455,-73.30118644516655,17732.85,963178.27,143644 39 | 2333e462-582c-9c83-d382-4c5e0c2c1ad0,2000-09-29,,999-58-7543,S99923245,X14074343X,Ms.,Lavette209,Zboncak558,,,,black,nonhispanic,F,Medfield Massachusetts US,994 Feest Crossroad Apt 13,Marblehead,Massachusetts,Essex County,25009,01945,42.49472255145156,-70.81702859735168,31189.37,595880.21,129433 40 | 931c7fd6-6330-1008-cef4-df84dd836d15,2002-10-04,,999-26-4422,S99938533,X25071442X,Mr.,Burton124,Stehr398,,,,white,nonhispanic,M,Taunton Massachusetts US,619 Upton Landing Apt 9,Chelmsford,Massachusetts,Middlesex County,,00000,42.548797580393405,-71.3540262724428,25994.44,467794.53,185360 41 | b3f13b30-5802-e5f3-685b-36c3c09283f1,2003-03-10,,999-20-4271,S99992852,,Ms.,Lessie363,Langworth352,,,,white,nonhispanic,F,Lynnfield Massachusetts US,745 Koelpin Trailer,Westborough,Massachusetts,Worcester County,25027,01581,42.30108671835689,-71.57704229893056,5139.91,88026.45,9657 42 | 4860e9a0-1263-6ca4-fe42-b7a73cbeec16,1947-12-14,2011-08-10,999-37-8682,S99927544,X81110019X,Mr.,Dominick530,Mills423,,,M,white,nonhispanic,M,Lynn Massachusetts US,1025 Spinka Overpass Suite 19,Wilbraham,Massachusetts,Hampden County,25013,01095,42.11528427077081,-72.45768605820261,58407.08,40127.28,326248 43 | 87424a5e-7848-aed5-fd59-4c8a76c2ed36,1965-08-08,,999-36-1150,S99945201,X58343864X,Mrs.,Zoila41,McGlynn426,,DuBuque211,M,white,nonhispanic,F,Brookline Massachusetts US,867 Langosh Grove Apt 84,Fairhaven,Massachusetts,Bristol County,,00000,41.63250541278397,-70.87313339013353,15950.00,1425035.79,15144 44 | 26a90721-54f3-b755-ecf4-a8aab978c01c,1963-02-27,,999-55-3195,S99981647,X12176676X,Mr.,Martín25,Roldán470,,,M,white,hispanic,M,Buenos Aires Ciudad de Buenos Aires AR,382 Satterfield Annex Suite 45,Ludlow,Massachusetts,Hampden County,,00000,42.219739387326875,-72.45861669907123,829967.44,0.00,49667 45 | 06d7ef99-093b-fe84-6d7c-52a3eab126fe,1955-04-04,,999-99-2436,S99953324,X10127197X,Ms.,Yetta429,Doyle959,,,S,white,nonhispanic,F,Grafton Massachusetts US,762 Senger Lodge,New Bedford,Massachusetts,Bristol County,25005,02740,41.62800607094596,-70.98717778570096,1079260.67,33271.04,49980 46 | 74d4ca38-9f05-2212-c539-44139fdd8ab4,2003-08-20,,999-44-9634,S99977402,,Mr.,Elden718,Collins926,,,,white,nonhispanic,M,Salem Massachusetts US,343 Reynolds Lock Unit 95,Cambridge,Massachusetts,Middlesex County,25017,02141,42.41937500378478,-71.10280714780403,7303.71,44500.70,16133 47 | f56c230b-3a7c-aca2-6363-fa3d46cf6596,1985-02-05,,999-51-3221,S99937081,X44375219X,Mrs.,Shaquana156,MacGyver246,,Deckow585,M,black,nonhispanic,F,Brockton Massachusetts US,282 Wintheiser Quay Suite 46,Sharon,Massachusetts,Norfolk County,25021,02067,42.07048110807395,-71.20906020387682,45950.72,726541.39,397408 48 | 37279a07-035d-e18b-bcd7-331dc3fe6304,1975-08-17,,999-91-9580,S99984964,X59848524X,Mr.,Jimmie93,Graham902,,,M,asian,nonhispanic,M,Beijing Beijing Municipality CN,269 Jones Estate Apt 87,Lynn,Massachusetts,Essex County,25009,01901,42.540185973577316,-70.95005850290717,50890.22,105568.63,58204 49 | 2d5a8517-f25f-9f66-9ab8-0a69425145c7,1960-04-25,,999-62-7937,S99914972,X40059543X,Mr.,Manuel446,Quitzon246,,,M,white,nonhispanic,M,New Bedford Massachusetts US,828 Hahn Ferry Suite 62,Raynham,Massachusetts,Bristol County,,00000,41.93105766594213,-71.08785486712216,54306.03,143291.33,77316 50 | 1666a800-a041-a2ca-4f9b-af668e740370,1963-06-05,,999-30-8851,S99915586,X37464865X,Mr.,Grant908,Hahn503,,,M,white,nonhispanic,M,Boston Massachusetts US,168 Ernser Viaduct Apt 32,Cambridge,Massachusetts,Middlesex County,25017,02141,42.33909824190683,-71.11622581495207,51477.06,68767.89,224975 51 | 8762040e-69bb-6ac8-685c-1d63a3d4dfe2,1960-02-25,,999-10-6028,S99958879,X62869090X,Ms.,Carmelita854,Hagenes547,,,S,white,nonhispanic,F,Taunton Massachusetts US,900 McClure Fort,Salem,Massachusetts,Essex County,25009,01970,42.52904093176596,-70.86167782746726,16750.00,1561478.75,13488 52 | 48334e94-64e7-91e8-d91b-1246110bf1ba,1932-07-09,2004-12-07,999-23-9351,S99916924,X14993625X,Mr.,Leandro563,Hane680,JD,,M,white,nonhispanic,M,Duxbury Massachusetts US,166 Jerde Avenue,Scituate,Massachusetts,Plymouth County,25023,02066,42.25402035749961,-70.7386488899729,62366.99,201314.86,870606 53 | a68f0fcf-424b-d8dd-3949-a69f0f3f9979,1935-09-06,2020-07-09,999-12-9121,S99987192,X75313923X,Mrs.,Janeth814,Feest103,,Shanahan202,M,white,nonhispanic,F,Boxford Massachusetts US,175 Mayer Frontage road Apt 63,East Longmeadow,Massachusetts,Hampden County,,00000,42.04366398532018,-72.53842235808679,909196.90,370234.71,29277 54 | 4f233603-d38e-fec1-7106-b6a09c62f28e,1947-12-14,2012-02-06,999-61-6740,S99975126,X69639917X,Mr.,Raymon366,Orn563,,,M,white,nonhispanic,M,East Bridgewater Massachusetts US,144 Waters Bypass Apt 13,Wilbraham,Massachusetts,Hampden County,25013,01095,42.1070174210795,-72.46946143213026,61483.39,148955.95,326248 55 | 561bc09a-56b9-859e-b926-fc66685d9df1,1946-03-21,,999-73-5643,S99995586,X13967823X,Mr.,Tyrell880,Schimmel440,,,M,white,nonhispanic,M,Georgetown Massachusetts US,102 Waters Estate Unit 93,New Bedford,Massachusetts,Bristol County,25005,02743,41.76092573197442,-70.93429596770599,69632.06,271187.57,62948 56 | 0c603e3d-ff1b-936d-14aa-9e875fa47cad,1932-07-09,2012-05-11,999-85-2178,S99970498,X20385492X,Mr.,Irving123,Hamill307,PhD,,M,white,nonhispanic,M,Kingston Massachusetts US,880 Bauch Lodge,Scituate,Massachusetts,Plymouth County,25023,02066,42.1894755210766,-70.72442915485766,67284.69,105065.45,870606 57 | da069417-667e-3b7e-8730-b00bf5dbcd7f,1935-09-06,2015-02-25,999-50-5586,S99935886,X71918603X,Mrs.,Ethel888,Corwin846,,Predovic534,M,white,nonhispanic,F,New Bedford Massachusetts US,344 Miller Street,East Longmeadow,Massachusetts,Hampden County,,00000,42.06043518424488,-72.46281354520333,724339.28,171776.51,29277 58 | dd19ae3a-2f3a-a636-f2bf-f3fe51e3ff7d,1947-12-14,2011-07-21,999-61-4140,S99946445,X63265370X,Mr.,Cedrick207,Cruickshank494,,,M,white,nonhispanic,M,Stoneham Massachusetts US,1051 Brakus Center Unit 33,Wilbraham,Massachusetts,Hampden County,25013,01095,42.153689936905664,-72.46563770143045,45887.05,1111058.21,326248 59 | 86636875-af39-df1b-edd7-209e8ffb77d2,1932-07-09,,999-26-8041,S99949411,X80854357X,Mr.,Garry927,Nikolaus26,MD,,M,white,nonhispanic,M,Rockland Massachusetts US,609 Paucek Skyway,Scituate,Massachusetts,Plymouth County,25023,02066,42.21737593221302,-70.71331268909627,53754.99,77450.36,870606 60 | 5dbb559c-7f46-f0bb-58ad-9069d1cce9b4,1935-09-06,,999-23-4696,S99949724,X57424902X,Mrs.,Hildred696,Casper496,,Kohler843,M,white,nonhispanic,F,Malden Massachusetts US,638 Bradtke Hollow,East Longmeadow,Massachusetts,Hampden County,,00000,42.04728057003359,-72.49659986178744,983859.74,213860.48,29277 61 | 9cfb6988-97da-cc56-58d4-81be46378f43,1947-12-14,,999-56-9201,S99974442,X17350166X,Mr.,Lonny638,Klein929,,,M,white,nonhispanic,M,Maynard Massachusetts US,848 Kuphal Junction,Wilbraham,Massachusetts,Hampden County,25013,01095,42.13813414111617,-72.45483019554929,71706.16,392055.44,326248 62 | -------------------------------------------------------------------------------- /nodes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import yaml 3 | from pocketflow import Node, BatchNode 4 | from utils.call_llm import call_llm 5 | 6 | def truncate_cell(value, max_length=50): 7 | """Truncate cell values for display purposes""" 8 | if pd.isna(value): 9 | return value 10 | str_value = str(value) 11 | if len(str_value) > max_length: 12 | return str_value[:max_length] + "..." 13 | return str_value 14 | 15 | 16 | class DuplicateDetectionNode(Node): 17 | def prep(self, shared): 18 | df = shared["dataframe"] 19 | 20 | # Find duplicate rows 21 | duplicate_rows = df[df.duplicated(keep=False)] 22 | duplicate_count = len(duplicate_rows) - len(duplicate_rows.drop_duplicates()) 23 | duplicate_percentage = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0 24 | 25 | # Get sample of duplicate rows for LLM analysis 26 | sample_duplicates = "" 27 | if duplicate_count > 0: 28 | sample_df = duplicate_rows.head(10).applymap(truncate_cell) 29 | sample_duplicates = sample_df.to_csv(index=False, quoting=1) 30 | 31 | # Get basic table info for context 32 | table_sample = df.head(5).applymap(truncate_cell).to_csv(index=False, quoting=1) 33 | 34 | return { 35 | "duplicate_count": duplicate_count, 36 | "duplicate_percentage": duplicate_percentage, 37 | "total_rows": len(df), 38 | "sample_duplicates": sample_duplicates, 39 | "table_sample": table_sample 40 | } 41 | 42 | def exec(self, prep_res): 43 | if prep_res["duplicate_count"] == 0: 44 | return { 45 | "should_remove": False, 46 | "analysis": "No duplicate rows found in the dataset." 47 | } 48 | 49 | prompt = f""" 50 | You have a table with {prep_res["total_rows"]} total rows and {prep_res["duplicate_count"]} duplicate rows ({prep_res["duplicate_percentage"]:.2f}%). 51 | 52 | Sample of the table: 53 | {prep_res["table_sample"]} 54 | 55 | Sample duplicate rows: 56 | {prep_res["sample_duplicates"]} 57 | 58 | Analyze these duplicates and decide whether they should be removed. 59 | 60 | Return in YAML format: 61 | ```yaml 62 | should_remove: true/false 63 | analysis: "Brief analysis explaining why duplicates should/shouldn't be removed" 64 | ``` 65 | """ 66 | 67 | response = call_llm(prompt) 68 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 69 | result = yaml.safe_load(yaml_str) 70 | 71 | assert "should_remove" in result 72 | assert "analysis" in result 73 | assert isinstance(result["should_remove"], bool) 74 | assert isinstance(result["analysis"], str) 75 | 76 | return result 77 | 78 | def post(self, shared, prep_res, exec_res): 79 | shared["profile_results"]["duplicates"] = { 80 | "count": prep_res["duplicate_count"], 81 | "percentage": prep_res["duplicate_percentage"], 82 | "total_rows": prep_res["total_rows"], 83 | "should_remove": exec_res["should_remove"], 84 | "analysis": exec_res["analysis"], 85 | "sample_rows": prep_res["sample_duplicates"] 86 | } 87 | 88 | class TableSummaryNode(Node): 89 | def prep(self, shared): 90 | df = shared["dataframe"] 91 | 92 | # Create a sample for LLM analysis 93 | sample_df = df.head(50).applymap(truncate_cell) 94 | sample_data = sample_df.to_csv(index=False, quoting=1) 95 | 96 | # Basic info 97 | column_names = list(df.columns) 98 | row_count = len(df) 99 | 100 | return { 101 | "sample_data": sample_data, 102 | "column_names": column_names, 103 | "row_count": row_count 104 | } 105 | 106 | def exec(self, prep_res): 107 | columns_str = ", ".join(prep_res["column_names"]) 108 | 109 | prompt = f""" 110 | You have a table with {prep_res["row_count"]} rows and the following columns: {columns_str} 111 | 112 | Sample data: 113 | {prep_res["sample_data"]} 114 | 115 | Task: Summarize what this table represents. 116 | - Highlight: Include and highlight ALL column names as **Column_Name** 117 | - Structure: Start with the big picture, then explain how columns are related 118 | - Requirement: ALL column names must be mentioned and **highlighted**. Use exact column names (case sensitive) 119 | - Style: Use a few short sentences with simple words 120 | 121 | Example: "The table contains information about ... with **Customer_ID**, **Order_Date**, and **Amount**..." 122 | 123 | Your summary: 124 | """ 125 | 126 | return call_llm(prompt) 127 | 128 | def post(self, shared, prep_res, exec_res): 129 | shared["profile_results"]["table_summary"] = exec_res 130 | return "default" 131 | 132 | class ColumnDescriptionNode(BatchNode): 133 | def prep(self, shared): 134 | df = shared["dataframe"] 135 | columns = list(df.columns) 136 | 137 | # Process columns in chunks of 10 138 | chunks = [] 139 | for i in range(0, len(columns), 10): 140 | chunk_columns = columns[i:i + 10] 141 | chunk_df = df[chunk_columns].head(5).applymap(truncate_cell) 142 | chunk_sample = chunk_df.to_csv(index=False, quoting=1) 143 | chunks.append((chunk_columns, chunk_sample)) 144 | 145 | return chunks 146 | 147 | def exec(self, chunk_data): 148 | chunk_columns, chunk_sample = chunk_data 149 | 150 | prompt = f""" 151 | You have the following table columns and sample data: 152 | {chunk_sample} 153 | 154 | For each column, provide a short description and suggest a better name if needed. 155 | 156 | Return in YAML format: 157 | ```yaml 158 | {chunk_columns[0]}: 159 | description: "Short description" 160 | suggested_name: "new_column_name" 161 | ... 162 | ``` 163 | """ 164 | 165 | response = call_llm(prompt) 166 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 167 | result = yaml.safe_load(yaml_str) 168 | 169 | # Validate all columns are present with required fields 170 | for col in chunk_columns: 171 | assert col in result, f"Column {col} missing from result" 172 | assert "description" in result[col], f"Description missing for {col}" 173 | assert "suggested_name" in result[col], f"Suggested name missing for {col}" 174 | assert isinstance(result[col]["description"], str) 175 | assert isinstance(result[col]["suggested_name"], str) 176 | 177 | return result 178 | 179 | def post(self, shared, prep_res, exec_res_list): 180 | # Combine results from all chunks 181 | all_descriptions = {} 182 | for chunk_result in exec_res_list: 183 | all_descriptions.update(chunk_result) 184 | 185 | # Convert to the expected format (now already in the right structure from YAML) 186 | shared["profile_results"]["column_descriptions"] = all_descriptions 187 | return "default" 188 | 189 | class DataTypeAnalysisNode(Node): 190 | def prep(self, shared): 191 | df = shared["dataframe"] 192 | 193 | # Get current data types 194 | current_types = {col: str(df[col].dtype) for col in df.columns} 195 | 196 | # Get sample data 197 | sample_df = df.head(10).applymap(truncate_cell) 198 | sample_data = sample_df.to_csv(index=False, quoting=1) 199 | 200 | return { 201 | "sample_data": sample_data, 202 | "current_types": current_types, 203 | "columns": list(df.columns) 204 | } 205 | 206 | def exec(self, prep_res): 207 | types_info = "\n".join([f"{col}: currently {dtype}" for col, dtype in prep_res["current_types"].items()]) 208 | valid_types = ["int64", "float64", "object", "datetime64", "bool", "category"] 209 | 210 | prompt = f""" 211 | You have the following table with current data types: 212 | {types_info} 213 | 214 | Sample data: 215 | {prep_res["sample_data"]} 216 | 217 | For each column, suggest the most appropriate data type from: {valid_types} 218 | 219 | Return in YAML format: 220 | ```yaml 221 | column1: 222 | suggested_type: "int64" 223 | reason: "Contains only integer values" 224 | ... 225 | ``` 226 | """ 227 | 228 | response = call_llm(prompt) 229 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 230 | result = yaml.safe_load(yaml_str) 231 | 232 | # Validate all columns are present with required fields 233 | for col in prep_res["columns"]: 234 | assert col in result, f"Column {col} missing from result" 235 | assert "suggested_type" in result[col], f"Suggested type missing for {col}" 236 | assert "reason" in result[col], f"Reason missing for {col}" 237 | assert result[col]["suggested_type"] in valid_types, f"Invalid type for {col}: {result[col]['suggested_type']}" 238 | assert isinstance(result[col]["reason"], str) 239 | 240 | return result 241 | 242 | def post(self, shared, prep_res, exec_res): 243 | # Combine current and suggested types 244 | data_types = {} 245 | for col in prep_res["columns"]: 246 | data_types[col] = { 247 | "current_type": prep_res["current_types"][col], 248 | "suggested_type": exec_res[col]["suggested_type"], 249 | "reason": exec_res[col]["reason"] 250 | } 251 | 252 | shared["profile_results"]["data_types"] = data_types 253 | return "default" 254 | 255 | class MissingValuesAnalysisNode(Node): 256 | def prep(self, shared): 257 | df = shared["dataframe"] 258 | 259 | # Calculate missing values 260 | missing_info = {} 261 | for col in df.columns: 262 | missing_count = df[col].isna().sum() 263 | if missing_count > 0: 264 | missing_percentage = (missing_count / len(df)) * 100 265 | missing_info[col] = { 266 | "count": missing_count, 267 | "percentage": missing_percentage 268 | } 269 | 270 | # Get sample data 271 | sample_df = df.head(10).applymap(truncate_cell) 272 | sample_data = sample_df.to_csv(index=False, quoting=1) 273 | 274 | return { 275 | "missing_info": missing_info, 276 | "sample_data": sample_data, 277 | "total_rows": len(df) 278 | } 279 | 280 | def exec(self, prep_res): 281 | if not prep_res["missing_info"]: 282 | return { 283 | "reasoning": "No missing values found in any columns.", 284 | "columns_analysis": {} 285 | } 286 | 287 | missing_desc = "\n".join([ 288 | f"{col}: {info['count']} missing ({info['percentage']:.1f}%)" 289 | for col, info in prep_res["missing_info"].items() 290 | ]) 291 | 292 | prompt = f""" 293 | You have a table with the following missing values: 294 | {missing_desc} 295 | 296 | Sample data for context: 297 | {prep_res["sample_data"]} 298 | 299 | For each column with missing values, determine if missing values are meaningful or problematic. 300 | 301 | Return in YAML format: 302 | ```yaml 303 | overall_analysis: "Brief overall analysis" 304 | columns: 305 | column_name: 306 | is_meaningful: true/false 307 | reason: "Brief explanation" 308 | ... 309 | ``` 310 | """ 311 | 312 | response = call_llm(prompt) 313 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 314 | result = yaml.safe_load(yaml_str) 315 | 316 | # Validate structure 317 | assert "overall_analysis" in result 318 | assert "columns" in result 319 | assert isinstance(result["overall_analysis"], str) 320 | assert isinstance(result["columns"], dict) 321 | 322 | # Validate each column analysis 323 | for col in prep_res["missing_info"].keys(): 324 | assert col in result["columns"], f"Missing analysis for column {col}" 325 | assert "is_meaningful" in result["columns"][col] 326 | assert "reason" in result["columns"][col] 327 | assert isinstance(result["columns"][col]["is_meaningful"], bool) 328 | assert isinstance(result["columns"][col]["reason"], str) 329 | 330 | return result 331 | 332 | def post(self, shared, prep_res, exec_res): 333 | missing_values = {} 334 | 335 | # Process columns with missing values 336 | for col, info in prep_res["missing_info"].items(): 337 | analysis = exec_res["columns"][col] 338 | missing_values[col] = { 339 | "count": info["count"], 340 | "percentage": info["percentage"], 341 | "is_meaningful": analysis["is_meaningful"], 342 | "reason": analysis["reason"] 343 | } 344 | 345 | # Add columns with no missing values 346 | df = shared["dataframe"] 347 | for col in df.columns: 348 | if col not in missing_values: 349 | missing_values[col] = { 350 | "count": 0, 351 | "percentage": 0.0, 352 | "is_meaningful": True, 353 | "reason": "No missing values" 354 | } 355 | 356 | shared["profile_results"]["missing_values"] = missing_values 357 | shared["profile_results"]["missing_analysis"] = exec_res["overall_analysis"] 358 | return "default" 359 | 360 | class UniquenessAnalysisNode(Node): 361 | def prep(self, shared): 362 | df = shared["dataframe"] 363 | 364 | # Calculate uniqueness for each column 365 | uniqueness_info = {} 366 | for col in df.columns: 367 | unique_count = df[col].nunique() 368 | total_count = len(df) 369 | unique_percentage = (unique_count / total_count) * 100 if total_count > 0 else 0 370 | 371 | uniqueness_info[col] = { 372 | "unique_count": unique_count, 373 | "total_count": total_count, 374 | "unique_percentage": unique_percentage 375 | } 376 | 377 | # Get sample data and table summary for context 378 | sample_df = df.head(10).applymap(truncate_cell) 379 | sample_data = sample_df.to_csv(index=False, quoting=1) 380 | table_summary = shared["profile_results"].get("table_summary", "") 381 | 382 | # Get highly unique columns (>90% unique) 383 | highly_unique = {col: info for col, info in uniqueness_info.items() 384 | if info["unique_percentage"] > 90} 385 | 386 | return { 387 | "uniqueness_info": uniqueness_info, 388 | "highly_unique": highly_unique, 389 | "sample_data": sample_data, 390 | "table_summary": table_summary 391 | } 392 | 393 | def exec(self, prep_res): 394 | if not prep_res["highly_unique"]: 395 | return { 396 | "reasoning": "No columns found that could serve as candidate keys.", 397 | "candidate_keys": {} 398 | } 399 | 400 | highly_unique_desc = "\n".join([ 401 | f"{col}: {info['unique_count']}/{info['total_count']} unique ({info['unique_percentage']:.1f}%)" 402 | for col, info in prep_res["highly_unique"].items() 403 | ]) 404 | 405 | prompt = f""" 406 | Table context: {prep_res["table_summary"]} 407 | 408 | Sample data: 409 | {prep_res["sample_data"]} 410 | 411 | The following columns have high uniqueness: 412 | {highly_unique_desc} 413 | 414 | Analyze which columns could serve as candidate keys (unique identifiers) for this table. 415 | Consider: 416 | - What each row represents in this table 417 | - Whether the column values should be unique across all rows 418 | - Avoid continuous numerical values (like temperatures, prices) that happen to be unique in the sample 419 | 420 | Return in YAML format: 421 | ```yaml 422 | reasoning: "Analysis of which columns can serve as identifiers..." 423 | candidate_keys: 424 | column_name: 425 | is_candidate_key: true/false 426 | explanation: "Why this column is/isn't a good candidate key" 427 | ... 428 | ``` 429 | """ 430 | 431 | response = call_llm(prompt) 432 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 433 | return yaml.safe_load(yaml_str) 434 | 435 | def post(self, shared, prep_res, exec_res): 436 | uniqueness = {} 437 | 438 | for col, info in prep_res["uniqueness_info"].items(): 439 | candidate_analysis = exec_res.get("candidate_keys", {}).get(col, {}) 440 | uniqueness[col] = { 441 | "unique_count": info["unique_count"], 442 | "unique_percentage": info["unique_percentage"], 443 | "is_candidate_key": candidate_analysis.get("is_candidate_key", False), 444 | "explanation": candidate_analysis.get("explanation", "") 445 | } 446 | 447 | shared["profile_results"]["uniqueness"] = uniqueness 448 | shared["profile_results"]["uniqueness_reasoning"] = exec_res.get("reasoning", "") 449 | return "default" 450 | 451 | class UnusualValuesDetectionNode(BatchNode): 452 | def prep(self, shared): 453 | df = shared["dataframe"] 454 | columns = list(df.columns) 455 | 456 | # Create analysis tasks for each column 457 | column_tasks = [] 458 | for col in columns: 459 | # Get sample of distinct values (up to 1000 for inspection) 460 | sample_values = df[col].dropna().drop_duplicates().head(1000) 461 | sample_list = [truncate_cell(val, 100) for val in sample_values] 462 | 463 | column_tasks.append({ 464 | "column_name": col, 465 | "sample_values": sample_list, 466 | "data_type": str(df[col].dtype) 467 | }) 468 | 469 | return column_tasks 470 | 471 | def exec(self, column_task): 472 | col_name = column_task["column_name"] 473 | sample_values = column_task["sample_values"] 474 | data_type = column_task["data_type"] 475 | 476 | if not sample_values: 477 | return { 478 | "column_name": col_name, 479 | "has_unusual": False, 480 | "explanation": "No values to analyze (all missing)" 481 | } 482 | 483 | values_str = ", ".join([f"'{val}'" for val in sample_values[:15]]) 484 | 485 | prompt = f""" 486 | Column "{col_name}" (type: {data_type}) has the following sample values: 487 | {values_str} 488 | 489 | Check if there are any unusual values that seem wrong or inconsistent. 490 | 491 | Return in YAML format: 492 | ```yaml 493 | has_unusual: true/false 494 | explanation: "Brief explanation of findings" 495 | ``` 496 | """ 497 | 498 | response = call_llm(prompt) 499 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 500 | result = yaml.safe_load(yaml_str) 501 | 502 | # Validate structure 503 | assert "has_unusual" in result 504 | assert "explanation" in result 505 | assert isinstance(result["has_unusual"], bool) 506 | assert isinstance(result["explanation"], str) 507 | 508 | result["column_name"] = col_name 509 | return result 510 | 511 | def post(self, shared, prep_res, exec_res_list): 512 | unusual_values = {} 513 | 514 | for result in exec_res_list: 515 | col_name = result["column_name"] 516 | unusual_values[col_name] = { 517 | "has_unusual": result["has_unusual"], 518 | "explanation": result["explanation"] 519 | } 520 | 521 | shared["profile_results"]["unusual_values"] = unusual_values 522 | return "default" 523 | 524 | class GenerateReportNode(Node): 525 | def prep(self, shared): 526 | return shared["profile_results"] 527 | 528 | def exec(self, profile_results): 529 | # Generate a comprehensive report 530 | report_sections = [] 531 | 532 | # Title 533 | report_sections.append("# Data Profiling Report\n") 534 | 535 | # Table Summary 536 | if "table_summary" in profile_results: 537 | report_sections.append("## Table Summary") 538 | report_sections.append(profile_results["table_summary"]) 539 | report_sections.append("") 540 | 541 | # Duplicates 542 | if "duplicates" in profile_results: 543 | dup = profile_results["duplicates"] 544 | report_sections.append("## Duplicate Analysis") 545 | report_sections.append(f"- **Total rows**: {dup['total_rows']}") 546 | report_sections.append(f"- **Duplicate rows**: {dup['count']} ({dup['percentage']:.2f}%)") 547 | report_sections.append(f"- **Should remove**: {dup['should_remove']}") 548 | report_sections.append(f"- **Analysis**: {dup['analysis']}") 549 | report_sections.append("") 550 | 551 | # Column Descriptions 552 | if "column_descriptions" in profile_results: 553 | report_sections.append("## Column Descriptions") 554 | for col, info in profile_results["column_descriptions"].items(): 555 | suggested = f" → *{info['suggested_name']}*" if info['suggested_name'] != col else "" 556 | report_sections.append(f"- **{col}**{suggested}: {info['description']}") 557 | report_sections.append("") 558 | 559 | # Data Types 560 | if "data_types" in profile_results: 561 | report_sections.append("## Data Type Analysis") 562 | changes_found = False 563 | for col, info in profile_results["data_types"].items(): 564 | if info['suggested_type'] != info['current_type']: 565 | report_sections.append(f"- **{col}**: {info['current_type']} → *{info['suggested_type']}* ({info['reason']})") 566 | changes_found = True 567 | if not changes_found: 568 | report_sections.append("- All data types are appropriate") 569 | report_sections.append("") 570 | 571 | # Missing Values 572 | if "missing_values" in profile_results: 573 | report_sections.append("## Missing Values Analysis") 574 | if "missing_analysis" in profile_results: 575 | report_sections.append(f"**Overview**: {profile_results['missing_analysis']}") 576 | report_sections.append("") 577 | 578 | problematic_missing = [] 579 | meaningful_missing = [] 580 | 581 | for col, info in profile_results["missing_values"].items(): 582 | if info['count'] > 0: 583 | entry = f"**{col}**: {info['count']} missing ({info['percentage']:.1f}%) - {info['reason']}" 584 | if info['is_meaningful']: 585 | meaningful_missing.append(entry) 586 | else: 587 | problematic_missing.append(entry) 588 | 589 | if problematic_missing: 590 | report_sections.append("### Problematic Missing Values") 591 | for entry in problematic_missing: 592 | report_sections.append(f"- {entry}") 593 | report_sections.append("") 594 | 595 | if meaningful_missing: 596 | report_sections.append("### Likely Meaningful Missing Values") 597 | for entry in meaningful_missing: 598 | report_sections.append(f"- {entry}") 599 | report_sections.append("") 600 | 601 | # Uniqueness 602 | if "uniqueness" in profile_results: 603 | report_sections.append("## Uniqueness Analysis") 604 | candidate_keys = [] 605 | highly_unique = [] 606 | 607 | for col, info in profile_results["uniqueness"].items(): 608 | if info['is_candidate_key']: 609 | candidate_keys.append(f"**{col}**: {info['explanation']}") 610 | elif info['unique_percentage'] > 50: 611 | highly_unique.append(f"**{col}**: {info['unique_percentage']:.1f}% unique") 612 | 613 | if candidate_keys: 614 | report_sections.append("### Candidate Key Columns") 615 | for key in candidate_keys: 616 | report_sections.append(f"- {key}") 617 | report_sections.append("") 618 | 619 | if highly_unique: 620 | report_sections.append("### Highly Unique Columns") 621 | for col in highly_unique: 622 | report_sections.append(f"- {col}") 623 | report_sections.append("") 624 | 625 | # Unusual Values 626 | if "unusual_values" in profile_results: 627 | report_sections.append("## Unusual Values Detection") 628 | unusual_found = [] 629 | 630 | for col, info in profile_results["unusual_values"].items(): 631 | if info['has_unusual']: 632 | unusual_found.append(f"**{col}**: {info['explanation']}") 633 | 634 | if unusual_found: 635 | for finding in unusual_found: 636 | report_sections.append(f"- {finding}") 637 | else: 638 | report_sections.append("- No unusual values detected") 639 | report_sections.append("") 640 | 641 | return "\n".join(report_sections) 642 | 643 | def post(self, shared, prep_res, exec_res): 644 | shared["final_report"] = exec_res 645 | print("Data profiling complete! Report generated.") 646 | return "default" -------------------------------------------------------------------------------- /.clinerules: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: "Agentic Coding" 4 | --- 5 | 6 | # Agentic Coding: Humans Design, Agents code! 7 | 8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification. 9 | {: .warning } 10 | 11 | ## Agentic Coding Steps 12 | 13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation: 14 | 15 | | Steps | Human | AI | Comment | 16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------| 17 | | 1. Requirements | ★★★ High | ★☆☆ Low | Humans understand the requirements and context. | 18 | | 2. Flow | ★★☆ Medium | ★★☆ Medium | Humans specify the high-level design, and the AI fills in the details. | 19 | | 3. Utilities | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. | 20 | | 4. Data | ★☆☆ Low | ★★★ High | AI designs the data schema, and humans verify. | 21 | | 5. Node | ★☆☆ Low | ★★★ High | The AI helps design the node based on the flow. | 22 | | 6. Implementation | ★☆☆ Low | ★★★ High | The AI implements the flow based on the design. | 23 | | 7. Optimization | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. | 24 | | 8. Reliability | ★☆☆ Low | ★★★ High | The AI writes test cases and addresses corner cases. | 25 | 26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit. 27 | - Understand AI systems' strengths and limitations: 28 | - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails) 29 | - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL) 30 | - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning) 31 | - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features. 32 | - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early. 33 | 34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes. 35 | - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)). 36 | - For each node in the flow, start with a high-level one-line description of what it does. 37 | - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine). 38 | - If using **Agent**, specify what are the inputs (context) and what are the possible actions. 39 | - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows. 40 | - Outline the flow and draw it in a mermaid diagram. For example: 41 | ```mermaid 42 | flowchart LR 43 | start[Start] --> batch[Batch] 44 | batch --> check[Check] 45 | check -->|OK| process 46 | check -->|Error| fix[Fix] 47 | fix --> check 48 | 49 | subgraph process[Process] 50 | step1[Step 1] --> step2[Step 2] 51 | end 52 | 53 | process --> endNode[End] 54 | ``` 55 | - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition. 56 | {: .best-practice } 57 | 58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions. 59 | - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world: 60 |
61 | 62 | - Reading inputs (e.g., retrieving Slack messages, reading emails) 63 | - Writing outputs (e.g., generating reports, sending emails) 64 | - Using external tools (e.g., calling LLMs, searching the web) 65 | - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system. 66 | - For each utility function, implement it and write a simple test. 67 | - Document their input/output, as well as why they are necessary. For example: 68 | - `name`: `get_embedding` (`utils/get_embedding.py`) 69 | - `input`: `str` 70 | - `output`: a vector of 3072 floats 71 | - `necessity`: Used by the second node to embed text 72 | - Example utility implementation: 73 | ```python 74 | # utils/call_llm.py 75 | from openai import OpenAI 76 | 77 | def call_llm(prompt): 78 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 79 | r = client.chat.completions.create( 80 | model="gpt-4o", 81 | messages=[{"role": "user", "content": prompt}] 82 | ) 83 | return r.choices[0].message.content 84 | 85 | if __name__ == "__main__": 86 | prompt = "What is the meaning of life?" 87 | print(call_llm(prompt)) 88 | ``` 89 | - > **Sometimes, design Utilities before Flow:** For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them. 90 | {: .best-practice } 91 | - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures. 92 | {: .warning } 93 | 94 | 4. **Data Design**: Design the shared store that nodes will use to communicate. 95 | - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data. 96 | - For simple systems, use an in-memory dictionary. 97 | - For more complex systems or when persistence is required, use a database. 98 | - **Don't Repeat Yourself**: Use in-memory references or foreign keys. 99 | - Example shared store design: 100 | ```python 101 | shared = { 102 | "user": { 103 | "id": "user123", 104 | "context": { # Another nested dict 105 | "weather": {"temp": 72, "condition": "sunny"}, 106 | "location": "San Francisco" 107 | } 108 | }, 109 | "results": {} # Empty dict to store outputs 110 | } 111 | ``` 112 | 113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions. 114 | - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example: 115 | - `type`: Regular (or Batch, or Async) 116 | - `prep`: Read "text" from the shared store 117 | - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures. 118 | - `post`: Write "embedding" to the shared store 119 | 120 | 6. **Implementation**: Implement the initial nodes and flows based on the design. 121 | - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins! 122 | - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking. 123 | - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system. 124 | - Add logging throughout the code to facilitate debugging. 125 | 126 | 7. **Optimization**: 127 | - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start. 128 | - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts. 129 | - If your flow design is already solid, move on to micro-optimizations: 130 | - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity. 131 | - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone. 132 | 133 | - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times. 134 | > 135 | >
136 | {: .best-practice } 137 | 138 | 8. **Reliability** 139 | - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times. 140 | - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging. 141 | - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain. 142 | 143 | ## Example LLM Project File Structure 144 | 145 | ``` 146 | my_project/ 147 | ├── main.py 148 | ├── nodes.py 149 | ├── flow.py 150 | ├── utils/ 151 | │ ├── __init__.py 152 | │ ├── call_llm.py 153 | │ └── search_web.py 154 | ├── requirements.txt 155 | └── docs/ 156 | └── design.md 157 | ``` 158 | 159 | - **`requirements.txt`**: Lists the Python dependencies for the project. 160 | ``` 161 | PyYAML 162 | pocketflow 163 | ``` 164 | 165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*. 166 | ~~~ 167 | # Design Doc: Your Project Name 168 | 169 | > Please DON'T remove notes for AI 170 | 171 | ## Requirements 172 | 173 | > Notes for AI: Keep it simple and clear. 174 | > If the requirements are abstract, write concrete user stories 175 | 176 | 177 | ## Flow Design 178 | 179 | > Notes for AI: 180 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit. 181 | > 2. Present a concise, high-level description of the workflow. 182 | 183 | ### Applicable Design Pattern: 184 | 185 | 1. Map the file summary into chunks, then reduce these chunks into a final summary. 186 | 2. Agentic file finder 187 | - *Context*: The entire summary of the file 188 | - *Action*: Find the file 189 | 190 | ### Flow high-level Design: 191 | 192 | 1. **First Node**: This node is for ... 193 | 2. **Second Node**: This node is for ... 194 | 3. **Third Node**: This node is for ... 195 | 196 | ```mermaid 197 | flowchart TD 198 | firstNode[First Node] --> secondNode[Second Node] 199 | secondNode --> thirdNode[Third Node] 200 | ``` 201 | ## Utility Functions 202 | 203 | > Notes for AI: 204 | > 1. Understand the utility function definition thoroughly by reviewing the doc. 205 | > 2. Include only the necessary utility functions, based on nodes in the flow. 206 | 207 | 1. **Call LLM** (`utils/call_llm.py`) 208 | - *Input*: prompt (str) 209 | - *Output*: response (str) 210 | - Generally used by most nodes for LLM tasks 211 | 212 | 2. **Embedding** (`utils/get_embedding.py`) 213 | - *Input*: str 214 | - *Output*: a vector of 3072 floats 215 | - Used by the second node to embed text 216 | 217 | ## Node Design 218 | 219 | ### Shared Store 220 | 221 | > Notes for AI: Try to minimize data redundancy 222 | 223 | The shared store structure is organized as follows: 224 | 225 | ```python 226 | shared = { 227 | "key": "value" 228 | } 229 | ``` 230 | 231 | ### Node Steps 232 | 233 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow. 234 | 235 | 1. First Node 236 | - *Purpose*: Provide a short explanation of the node’s function 237 | - *Type*: Decide between Regular, Batch, or Async 238 | - *Steps*: 239 | - *prep*: Read "key" from the shared store 240 | - *exec*: Call the utility function 241 | - *post*: Write "key" to the shared store 242 | 243 | 2. Second Node 244 | ... 245 | ~~~ 246 | 247 | 248 | - **`utils/`**: Contains all utility functions. 249 | - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`. 250 | - Each file should also include a `main()` function to try that API call 251 | ```python 252 | from google import genai 253 | import os 254 | 255 | def call_llm(prompt: str) -> str: 256 | client = genai.Client( 257 | api_key=os.getenv("GEMINI_API_KEY", ""), 258 | ) 259 | model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") 260 | response = client.models.generate_content(model=model, contents=[prompt]) 261 | return response.text 262 | 263 | if __name__ == "__main__": 264 | test_prompt = "Hello, how are you?" 265 | 266 | # First call - should hit the API 267 | print("Making call...") 268 | response1 = call_llm(test_prompt, use_cache=False) 269 | print(f"Response: {response1}") 270 | ``` 271 | 272 | - **`nodes.py`**: Contains all the node definitions. 273 | ```python 274 | # nodes.py 275 | from pocketflow import Node 276 | from utils.call_llm import call_llm 277 | 278 | class GetQuestionNode(Node): 279 | def exec(self, _): 280 | # Get question directly from user input 281 | user_question = input("Enter your question: ") 282 | return user_question 283 | 284 | def post(self, shared, prep_res, exec_res): 285 | # Store the user's question 286 | shared["question"] = exec_res 287 | return "default" # Go to the next node 288 | 289 | class AnswerNode(Node): 290 | def prep(self, shared): 291 | # Read question from shared 292 | return shared["question"] 293 | 294 | def exec(self, question): 295 | # Call LLM to get the answer 296 | return call_llm(question) 297 | 298 | def post(self, shared, prep_res, exec_res): 299 | # Store the answer in shared 300 | shared["answer"] = exec_res 301 | ``` 302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them. 303 | ```python 304 | # flow.py 305 | from pocketflow import Flow 306 | from nodes import GetQuestionNode, AnswerNode 307 | 308 | def create_qa_flow(): 309 | """Create and return a question-answering flow.""" 310 | # Create nodes 311 | get_question_node = GetQuestionNode() 312 | answer_node = AnswerNode() 313 | 314 | # Connect nodes in sequence 315 | get_question_node >> answer_node 316 | 317 | # Create flow starting with input node 318 | return Flow(start=get_question_node) 319 | ``` 320 | - **`main.py`**: Serves as the project's entry point. 321 | ```python 322 | # main.py 323 | from flow import create_qa_flow 324 | 325 | # Example main function 326 | # Please replace this with your own main function 327 | def main(): 328 | shared = { 329 | "question": None, # Will be populated by GetQuestionNode from user input 330 | "answer": None # Will be populated by AnswerNode 331 | } 332 | 333 | # Create the flow and run it 334 | qa_flow = create_qa_flow() 335 | qa_flow.run(shared) 336 | print(f"Question: {shared['question']}") 337 | print(f"Answer: {shared['answer']}") 338 | 339 | if __name__ == "__main__": 340 | main() 341 | ``` 342 | 343 | ================================================ 344 | File: docs/index.md 345 | ================================================ 346 | --- 347 | layout: default 348 | title: "Home" 349 | nav_order: 1 350 | --- 351 | 352 | # Pocket Flow 353 | 354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*. 355 | 356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, and vendor lock-in. 357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more. 358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications. 359 | 360 |
361 | 362 |
363 | 364 | ## Core Abstraction 365 | 366 | We model the LLM workflow as a **Graph + Shared Store**: 367 | 368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks. 369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges). 370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows. 371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks. 372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks. 373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks. 374 | 375 |
376 | 377 |
378 | 379 | ## Design Pattern 380 | 381 | From there, it’s easy to implement popular design patterns: 382 | 383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions. 384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines. 385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation. 386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps. 387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently. 388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents. 389 | 390 |
391 | 392 |
393 | 394 | ## Utility Function 395 | 396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*: 397 | 398 | - [LLM Wrapper](./utility_function/llm.md) 399 | - [Viz and Debug](./utility_function/viz.md) 400 | - [Web Search](./utility_function/websearch.md) 401 | - [Chunking](./utility_function/chunking.md) 402 | - [Embedding](./utility_function/embedding.md) 403 | - [Vector Databases](./utility_function/vector.md) 404 | - [Text-to-Speech](./utility_function/text_to_speech.md) 405 | 406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework: 407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs. 408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally. 409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in. 410 | 411 | ## Ready to build your Apps? 412 | 413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow! 414 | 415 | ================================================ 416 | File: docs/core_abstraction/async.md 417 | ================================================ 418 | --- 419 | layout: default 420 | title: "(Advanced) Async" 421 | parent: "Core Abstraction" 422 | nav_order: 5 423 | --- 424 | 425 | # (Advanced) Async 426 | 427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for: 428 | 429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way. 430 | 2. **exec_async()**: Typically used for async LLM calls. 431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`. 432 | 433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes. 434 | 435 | ### Example 436 | 437 | ```python 438 | class SummarizeThenVerify(AsyncNode): 439 | async def prep_async(self, shared): 440 | # Example: read a file asynchronously 441 | doc_text = await read_file_async(shared["doc_path"]) 442 | return doc_text 443 | 444 | async def exec_async(self, prep_res): 445 | # Example: async LLM call 446 | summary = await call_llm_async(f"Summarize: {prep_res}") 447 | return summary 448 | 449 | async def post_async(self, shared, prep_res, exec_res): 450 | # Example: wait for user feedback 451 | decision = await gather_user_feedback(exec_res) 452 | if decision == "approve": 453 | shared["summary"] = exec_res 454 | return "approve" 455 | return "deny" 456 | 457 | summarize_node = SummarizeThenVerify() 458 | final_node = Finalize() 459 | 460 | # Define transitions 461 | summarize_node - "approve" >> final_node 462 | summarize_node - "deny" >> summarize_node # retry 463 | 464 | flow = AsyncFlow(start=summarize_node) 465 | 466 | async def main(): 467 | shared = {"doc_path": "document.txt"} 468 | await flow.run_async(shared) 469 | print("Final Summary:", shared.get("summary")) 470 | 471 | asyncio.run(main()) 472 | ``` 473 | 474 | ================================================ 475 | File: docs/core_abstraction/batch.md 476 | ================================================ 477 | --- 478 | layout: default 479 | title: "Batch" 480 | parent: "Core Abstraction" 481 | nav_order: 4 482 | --- 483 | 484 | # Batch 485 | 486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases: 487 | - **Chunk-based** processing (e.g., splitting large texts). 488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs). 489 | 490 | ## 1. BatchNode 491 | 492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`: 493 | 494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator). 495 | - **`exec(item)`**: called **once** per item in that iterable. 496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**. 497 | 498 | 499 | ### Example: Summarize a Large File 500 | 501 | ```python 502 | class MapSummaries(BatchNode): 503 | def prep(self, shared): 504 | # Suppose we have a big file; chunk it 505 | content = shared["data"] 506 | chunk_size = 10000 507 | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] 508 | return chunks 509 | 510 | def exec(self, chunk): 511 | prompt = f"Summarize this chunk in 10 words: {chunk}" 512 | summary = call_llm(prompt) 513 | return summary 514 | 515 | def post(self, shared, prep_res, exec_res_list): 516 | combined = "\n".join(exec_res_list) 517 | shared["summary"] = combined 518 | return "default" 519 | 520 | map_summaries = MapSummaries() 521 | flow = Flow(start=map_summaries) 522 | flow.run(shared) 523 | ``` 524 | 525 | --- 526 | 527 | ## 2. BatchFlow 528 | 529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set. 530 | 531 | ### Example: Summarize Many Files 532 | 533 | ```python 534 | class SummarizeAllFiles(BatchFlow): 535 | def prep(self, shared): 536 | # Return a list of param dicts (one per file) 537 | filenames = list(shared["data"].keys()) # e.g., ["file1.txt", "file2.txt", ...] 538 | return [{"filename": fn} for fn in filenames] 539 | 540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce): 541 | summarize_file = SummarizeFile(start=load_file) 542 | 543 | # Wrap that flow into a BatchFlow: 544 | summarize_all_files = SummarizeAllFiles(start=summarize_file) 545 | summarize_all_files.run(shared) 546 | ``` 547 | 548 | ### Under the Hood 549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`. 550 | 2. The **BatchFlow** loops through each dict. For each one: 551 | - It merges the dict with the BatchFlow’s own `params`. 552 | - It calls `flow.run(shared)` using the merged result. 553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict. 554 | 555 | --- 556 | 557 | ## 3. Nested or Multi-Level Batches 558 | 559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance: 560 | - **Outer** batch: returns a list of diretory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...). 561 | - **Inner** batch: returning a list of per-file param dicts. 562 | 563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once. 564 | 565 | ```python 566 | 567 | class FileBatchFlow(BatchFlow): 568 | def prep(self, shared): 569 | directory = self.params["directory"] 570 | # e.g., files = ["file1.txt", "file2.txt", ...] 571 | files = [f for f in os.listdir(directory) if f.endswith(".txt")] 572 | return [{"filename": f} for f in files] 573 | 574 | class DirectoryBatchFlow(BatchFlow): 575 | def prep(self, shared): 576 | directories = [ "/path/to/dirA", "/path/to/dirB"] 577 | return [{"directory": d} for d in directories] 578 | 579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"} 580 | inner_flow = FileBatchFlow(start=MapSummaries()) 581 | outer_flow = DirectoryBatchFlow(start=inner_flow) 582 | ``` 583 | 584 | ================================================ 585 | File: docs/core_abstraction/communication.md 586 | ================================================ 587 | --- 588 | layout: default 589 | title: "Communication" 590 | parent: "Core Abstraction" 591 | nav_order: 3 592 | --- 593 | 594 | # Communication 595 | 596 | Nodes and Flows **communicate** in 2 ways: 597 | 598 | 1. **Shared Store (for almost all the cases)** 599 | 600 | - A global data structure (often an in-mem dict) that all nodes can read ( `prep()`) and write (`post()`). 601 | - Great for data results, large content, or anything multiple nodes need. 602 | - You shall design the data structure and populate it ahead. 603 | 604 | - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](./batch.md). 605 | {: .best-practice } 606 | 607 | 2. **Params (only for [Batch](./batch.md))** 608 | - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**. 609 | - Good for identifiers like filenames or numeric IDs, in Batch mode. 610 | 611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller). 612 | 613 | --- 614 | 615 | ## 1. Shared Store 616 | 617 | ### Overview 618 | 619 | A shared store is typically an in-mem dictionary, like: 620 | ```python 621 | shared = {"data": {}, "summary": {}, "config": {...}, ...} 622 | ``` 623 | 624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements. 625 | 626 | ### Example 627 | 628 | ```python 629 | class LoadData(Node): 630 | def post(self, shared, prep_res, exec_res): 631 | # We write data to shared store 632 | shared["data"] = "Some text content" 633 | return None 634 | 635 | class Summarize(Node): 636 | def prep(self, shared): 637 | # We read data from shared store 638 | return shared["data"] 639 | 640 | def exec(self, prep_res): 641 | # Call LLM to summarize 642 | prompt = f"Summarize: {prep_res}" 643 | summary = call_llm(prompt) 644 | return summary 645 | 646 | def post(self, shared, prep_res, exec_res): 647 | # We write summary to shared store 648 | shared["summary"] = exec_res 649 | return "default" 650 | 651 | load_data = LoadData() 652 | summarize = Summarize() 653 | load_data >> summarize 654 | flow = Flow(start=load_data) 655 | 656 | shared = {} 657 | flow.run(shared) 658 | ``` 659 | 660 | Here: 661 | - `LoadData` writes to `shared["data"]`. 662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`. 663 | 664 | --- 665 | 666 | ## 2. Params 667 | 668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are: 669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`). 670 | - **Set** via `set_params()`. 671 | - **Cleared** and updated each time a parent Flow calls it. 672 | 673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow. 674 | > 675 | > If you need to set child node params, see [Batch](./batch.md). 676 | {: .warning } 677 | 678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store. 679 | 680 | ### Example 681 | 682 | ```python 683 | # 1) Create a Node that uses params 684 | class SummarizeFile(Node): 685 | def prep(self, shared): 686 | # Access the node's param 687 | filename = self.params["filename"] 688 | return shared["data"].get(filename, "") 689 | 690 | def exec(self, prep_res): 691 | prompt = f"Summarize: {prep_res}" 692 | return call_llm(prompt) 693 | 694 | def post(self, shared, prep_res, exec_res): 695 | filename = self.params["filename"] 696 | shared["summary"][filename] = exec_res 697 | return "default" 698 | 699 | # 2) Set params 700 | node = SummarizeFile() 701 | 702 | # 3) Set Node params directly (for testing) 703 | node.set_params({"filename": "doc1.txt"}) 704 | node.run(shared) 705 | 706 | # 4) Create Flow 707 | flow = Flow(start=node) 708 | 709 | # 5) Set Flow params (overwrites node params) 710 | flow.set_params({"filename": "doc2.txt"}) 711 | flow.run(shared) # The node summarizes doc2, not doc1 712 | ``` 713 | 714 | ================================================ 715 | File: docs/core_abstraction/flow.md 716 | ================================================ 717 | --- 718 | layout: default 719 | title: "Flow" 720 | parent: "Core Abstraction" 721 | nav_order: 2 722 | --- 723 | 724 | # Flow 725 | 726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`. 727 | 728 | ## 1. Action-based Transitions 729 | 730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`. 731 | 732 | You define transitions with the syntax: 733 | 734 | 1. **Basic default transition**: `node_a >> node_b` 735 | This means if `node_a.post()` returns `"default"`, go to `node_b`. 736 | (Equivalent to `node_a - "default" >> node_b`) 737 | 738 | 2. **Named action transition**: `node_a - "action_name" >> node_b` 739 | This means if `node_a.post()` returns `"action_name"`, go to `node_b`. 740 | 741 | It's possible to create loops, branching, or multi-step flows. 742 | 743 | ## 2. Creating a Flow 744 | 745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node. 746 | 747 | ### Example: Simple Sequence 748 | 749 | Here's a minimal flow of two nodes in a chain: 750 | 751 | ```python 752 | node_a >> node_b 753 | flow = Flow(start=node_a) 754 | flow.run(shared) 755 | ``` 756 | 757 | - When you run the flow, it executes `node_a`. 758 | - Suppose `node_a.post()` returns `"default"`. 759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`. 760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there. 761 | 762 | ### Example: Branching & Looping 763 | 764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions: 765 | 766 | - `"approved"`: expense is approved, move to payment processing 767 | - `"needs_revision"`: expense needs changes, send back for revision 768 | - `"rejected"`: expense is denied, finish the process 769 | 770 | We can wire them like this: 771 | 772 | ```python 773 | # Define the flow connections 774 | review - "approved" >> payment # If approved, process payment 775 | review - "needs_revision" >> revise # If needs changes, go to revision 776 | review - "rejected" >> finish # If rejected, finish the process 777 | 778 | revise >> review # After revision, go back for another review 779 | payment >> finish # After payment, finish the process 780 | 781 | flow = Flow(start=review) 782 | ``` 783 | 784 | Let's see how it flows: 785 | 786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node 787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review` 788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops 789 | 790 | ```mermaid 791 | flowchart TD 792 | review[Review Expense] -->|approved| payment[Process Payment] 793 | review -->|needs_revision| revise[Revise Report] 794 | review -->|rejected| finish[Finish Process] 795 | 796 | revise --> review 797 | payment --> finish 798 | ``` 799 | 800 | ### Running Individual Nodes vs. Running a Flow 801 | 802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action. 803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue. 804 | 805 | > `node.run(shared)` **does not** proceed to the successor. 806 | > This is mainly for debugging or testing a single node. 807 | > 808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly. 809 | {: .warning } 810 | 811 | ## 3. Nested Flows 812 | 813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can: 814 | 815 | 1. Use a Flow as a Node within another Flow's transitions. 816 | 2. Combine multiple smaller Flows into a larger Flow for reuse. 817 | 3. Node `params` will be a merging of **all** parents' `params`. 818 | 819 | ### Flow's Node Methods 820 | 821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However: 822 | 823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes. 824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store. 825 | 826 | ### Basic Flow Nesting 827 | 828 | Here's how to connect a flow to another node: 829 | 830 | ```python 831 | # Create a sub-flow 832 | node_a >> node_b 833 | subflow = Flow(start=node_a) 834 | 835 | # Connect it to another node 836 | subflow >> node_c 837 | 838 | # Create the parent flow 839 | parent_flow = Flow(start=subflow) 840 | ``` 841 | 842 | When `parent_flow.run()` executes: 843 | 1. It starts `subflow` 844 | 2. `subflow` runs through its nodes (`node_a->node_b`) 845 | 3. After `subflow` completes, execution continues to `node_c` 846 | 847 | ### Example: Order Processing Pipeline 848 | 849 | Here's a practical example that breaks down order processing into nested flows: 850 | 851 | ```python 852 | # Payment processing sub-flow 853 | validate_payment >> process_payment >> payment_confirmation 854 | payment_flow = Flow(start=validate_payment) 855 | 856 | # Inventory sub-flow 857 | check_stock >> reserve_items >> update_inventory 858 | inventory_flow = Flow(start=check_stock) 859 | 860 | # Shipping sub-flow 861 | create_label >> assign_carrier >> schedule_pickup 862 | shipping_flow = Flow(start=create_label) 863 | 864 | # Connect the flows into a main order pipeline 865 | payment_flow >> inventory_flow >> shipping_flow 866 | 867 | # Create the master flow 868 | order_pipeline = Flow(start=payment_flow) 869 | 870 | # Run the entire pipeline 871 | order_pipeline.run(shared_data) 872 | ``` 873 | 874 | This creates a clean separation of concerns while maintaining a clear execution path: 875 | 876 | ```mermaid 877 | flowchart LR 878 | subgraph order_pipeline[Order Pipeline] 879 | subgraph paymentFlow["Payment Flow"] 880 | A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation] 881 | end 882 | 883 | subgraph inventoryFlow["Inventory Flow"] 884 | D[Check Stock] --> E[Reserve Items] --> F[Update Inventory] 885 | end 886 | 887 | subgraph shippingFlow["Shipping Flow"] 888 | G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup] 889 | end 890 | 891 | paymentFlow --> inventoryFlow 892 | inventoryFlow --> shippingFlow 893 | end 894 | ``` 895 | 896 | ================================================ 897 | File: docs/core_abstraction/node.md 898 | ================================================ 899 | --- 900 | layout: default 901 | title: "Node" 902 | parent: "Core Abstraction" 903 | nav_order: 1 904 | --- 905 | 906 | # Node 907 | 908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`: 909 | 910 |
911 | 912 |
913 | 914 | 1. `prep(shared)` 915 | - **Read and preprocess data** from `shared` store. 916 | - Examples: *query DB, read files, or serialize data into a string*. 917 | - Return `prep_res`, which is used by `exec()` and `post()`. 918 | 919 | 2. `exec(prep_res)` 920 | - **Execute compute logic**, with optional retries and error handling (below). 921 | - Examples: *(mostly) LLM calls, remote APIs, tool use*. 922 | - ⚠️ This shall be only for compute and **NOT** access `shared`. 923 | - ⚠️ If retries enabled, ensure idempotent implementation. 924 | - ⚠️ Defer exception handling to the Node's built-in retry mechanism. 925 | - Return `exec_res`, which is passed to `post()`. 926 | 927 | 3. `post(shared, prep_res, exec_res)` 928 | - **Postprocess and write data** back to `shared`. 929 | - Examples: *update DB, change states, log results*. 930 | - **Decide the next action** by returning a *string* (`action = "default"` if *None*). 931 | 932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately. 933 | > 934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data. 935 | {: .note } 936 | 937 | ### Fault Tolerance & Retries 938 | 939 | You can **retry** `exec()` if it raises an exception via two parameters when define the Node: 940 | 941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry). 942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting). 943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off. 944 | 945 | ```python 946 | my_node = SummarizeFile(max_retries=3, wait=10) 947 | ``` 948 | 949 | When an exception occurs in `exec()`, the Node automatically retries until: 950 | 951 | - It either succeeds, or 952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt. 953 | 954 | You can get the current retry times (0-based) from `self.cur_retry`. 955 | 956 | ```python 957 | class RetryNode(Node): 958 | def exec(self, prep_res): 959 | print(f"Retry {self.cur_retry} times") 960 | raise Exception("Failed") 961 | ``` 962 | 963 | ### Graceful Fallback 964 | 965 | To **gracefully handle** the exception (after all retries) rather than raising it, override: 966 | 967 | ```python 968 | def exec_fallback(self, prep_res, exc): 969 | raise exc 970 | ``` 971 | 972 | By default, it just re-raises exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`. 973 | 974 | ### Example: Summarize file 975 | 976 | ```python 977 | class SummarizeFile(Node): 978 | def prep(self, shared): 979 | return shared["data"] 980 | 981 | def exec(self, prep_res): 982 | if not prep_res: 983 | return "Empty file content" 984 | prompt = f"Summarize this text in 10 words: {prep_res}" 985 | summary = call_llm(prompt) # might fail 986 | return summary 987 | 988 | def exec_fallback(self, prep_res, exc): 989 | # Provide a simple fallback instead of crashing 990 | return "There was an error processing your request." 991 | 992 | def post(self, shared, prep_res, exec_res): 993 | shared["summary"] = exec_res 994 | # Return "default" by not returning 995 | 996 | summarize_node = SummarizeFile(max_retries=3) 997 | 998 | # node.run() calls prep->exec->post 999 | # If exec() fails, it retries up to 3 times before calling exec_fallback() 1000 | action_result = summarize_node.run(shared) 1001 | 1002 | print("Action returned:", action_result) # "default" 1003 | print("Summary stored:", shared["summary"]) 1004 | ``` 1005 | 1006 | ================================================ 1007 | File: docs/core_abstraction/parallel.md 1008 | ================================================ 1009 | --- 1010 | layout: default 1011 | title: "(Advanced) Parallel" 1012 | parent: "Core Abstraction" 1013 | nav_order: 6 1014 | --- 1015 | 1016 | # (Advanced) Parallel 1017 | 1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute. 1019 | 1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O. 1021 | {: .warning } 1022 | 1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize. 1024 | > 1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals). 1026 | > 1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits. 1028 | {: .best-practice } 1029 | 1030 | ## AsyncParallelBatchNode 1031 | 1032 | Like **AsyncBatchNode**, but run `exec_async()` in **parallel**: 1033 | 1034 | ```python 1035 | class ParallelSummaries(AsyncParallelBatchNode): 1036 | async def prep_async(self, shared): 1037 | # e.g., multiple texts 1038 | return shared["texts"] 1039 | 1040 | async def exec_async(self, text): 1041 | prompt = f"Summarize: {text}" 1042 | return await call_llm_async(prompt) 1043 | 1044 | async def post_async(self, shared, prep_res, exec_res_list): 1045 | shared["summary"] = "\n\n".join(exec_res_list) 1046 | return "default" 1047 | 1048 | node = ParallelSummaries() 1049 | flow = AsyncFlow(start=node) 1050 | ``` 1051 | 1052 | ## AsyncParallelBatchFlow 1053 | 1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters: 1055 | 1056 | ```python 1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow): 1058 | async def prep_async(self, shared): 1059 | return [{"filename": f} for f in shared["files"]] 1060 | 1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile()) 1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow) 1063 | await parallel_flow.run_async(shared) 1064 | ``` 1065 | 1066 | ================================================ 1067 | File: docs/design_pattern/agent.md 1068 | ================================================ 1069 | --- 1070 | layout: default 1071 | title: "Agent" 1072 | parent: "Design Pattern" 1073 | nav_order: 1 1074 | --- 1075 | 1076 | # Agent 1077 | 1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context. 1079 | 1080 |
1081 | 1082 |
1083 | 1084 | ## Implement Agent with Graph 1085 | 1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions. 1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step. 1088 | 3. **Agent Node:** Provide a prompt to decide action—for example: 1089 | 1090 | ```python 1091 | f""" 1092 | ### CONTEXT 1093 | Task: {task_description} 1094 | Previous Actions: {previous_actions} 1095 | Current State: {current_state} 1096 | 1097 | ### ACTION SPACE 1098 | [1] search 1099 | Description: Use web search to get results 1100 | Parameters: 1101 | - query (str): What to search for 1102 | 1103 | [2] answer 1104 | Description: Conclude based on the results 1105 | Parameters: 1106 | - result (str): Final answer to provide 1107 | 1108 | ### NEXT ACTION 1109 | Decide the next action based on the current context and available action space. 1110 | Return your response in the following format: 1111 | 1112 | ```yaml 1113 | thinking: | 1114 | 1115 | action: 1116 | parameters: 1117 | : 1118 | ```""" 1119 | ``` 1120 | 1121 | The core of building **high-performance** and **reliable** agents boils down to: 1122 | 1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content. 1124 | 1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or `read_csvs`. Instead, import CSVs into the database. 1126 | 1127 | ## Example Good Action Design 1128 | 1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once. 1130 | 1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts). 1132 | 1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files. 1134 | 1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends. 1136 | 1137 | ## Example: Search Agent 1138 | 1139 | This agent: 1140 | 1. Decides whether to search or answer 1141 | 2. If searches, loops back to decide if more search needed 1142 | 3. Answers when enough context gathered 1143 | 1144 | ```python 1145 | class DecideAction(Node): 1146 | def prep(self, shared): 1147 | context = shared.get("context", "No previous search") 1148 | query = shared["query"] 1149 | return query, context 1150 | 1151 | def exec(self, inputs): 1152 | query, context = inputs 1153 | prompt = f""" 1154 | Given input: {query} 1155 | Previous search results: {context} 1156 | Should I: 1) Search web for more info 2) Answer with current knowledge 1157 | Output in yaml: 1158 | ```yaml 1159 | action: search/answer 1160 | reason: why this action 1161 | search_term: search phrase if action is search 1162 | ```""" 1163 | resp = call_llm(prompt) 1164 | yaml_str = resp.split("```yaml")[1].split("```")[0].strip() 1165 | result = yaml.safe_load(yaml_str) 1166 | 1167 | assert isinstance(result, dict) 1168 | assert "action" in result 1169 | assert "reason" in result 1170 | assert result["action"] in ["search", "answer"] 1171 | if result["action"] == "search": 1172 | assert "search_term" in result 1173 | 1174 | return result 1175 | 1176 | def post(self, shared, prep_res, exec_res): 1177 | if exec_res["action"] == "search": 1178 | shared["search_term"] = exec_res["search_term"] 1179 | return exec_res["action"] 1180 | 1181 | class SearchWeb(Node): 1182 | def prep(self, shared): 1183 | return shared["search_term"] 1184 | 1185 | def exec(self, search_term): 1186 | return search_web(search_term) 1187 | 1188 | def post(self, shared, prep_res, exec_res): 1189 | prev_searches = shared.get("context", []) 1190 | shared["context"] = prev_searches + [ 1191 | {"term": shared["search_term"], "result": exec_res} 1192 | ] 1193 | return "decide" 1194 | 1195 | class DirectAnswer(Node): 1196 | def prep(self, shared): 1197 | return shared["query"], shared.get("context", "") 1198 | 1199 | def exec(self, inputs): 1200 | query, context = inputs 1201 | return call_llm(f"Context: {context}\nAnswer: {query}") 1202 | 1203 | def post(self, shared, prep_res, exec_res): 1204 | print(f"Answer: {exec_res}") 1205 | shared["answer"] = exec_res 1206 | 1207 | # Connect nodes 1208 | decide = DecideAction() 1209 | search = SearchWeb() 1210 | answer = DirectAnswer() 1211 | 1212 | decide - "search" >> search 1213 | decide - "answer" >> answer 1214 | search - "decide" >> decide # Loop back 1215 | 1216 | flow = Flow(start=decide) 1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"}) 1218 | ``` 1219 | 1220 | ================================================ 1221 | File: docs/design_pattern/mapreduce.md 1222 | ================================================ 1223 | --- 1224 | layout: default 1225 | title: "Map Reduce" 1226 | parent: "Design Pattern" 1227 | nav_order: 4 1228 | --- 1229 | 1230 | # Map Reduce 1231 | 1232 | MapReduce is a design pattern suitable when you have either: 1233 | - Large input data (e.g., multiple files to process), or 1234 | - Large output data (e.g., multiple forms to fill) 1235 | 1236 | and there is a logical way to break the task into smaller, ideally independent parts. 1237 | 1238 |
1239 | 1240 |
1241 | 1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase. 1243 | 1244 | ### Example: Document Summarization 1245 | 1246 | ```python 1247 | class SummarizeAllFiles(BatchNode): 1248 | def prep(self, shared): 1249 | files_dict = shared["files"] # e.g. 10 files 1250 | return list(files_dict.items()) # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...] 1251 | 1252 | def exec(self, one_file): 1253 | filename, file_content = one_file 1254 | summary_text = call_llm(f"Summarize the following file:\n{file_content}") 1255 | return (filename, summary_text) 1256 | 1257 | def post(self, shared, prep_res, exec_res_list): 1258 | shared["file_summaries"] = dict(exec_res_list) 1259 | 1260 | class CombineSummaries(Node): 1261 | def prep(self, shared): 1262 | return shared["file_summaries"] 1263 | 1264 | def exec(self, file_summaries): 1265 | # format as: "File1: summary\nFile2: summary...\n" 1266 | text_list = [] 1267 | for fname, summ in file_summaries.items(): 1268 | text_list.append(f"{fname} summary:\n{summ}\n") 1269 | big_text = "\n---\n".join(text_list) 1270 | 1271 | return call_llm(f"Combine these file summaries into one final summary:\n{big_text}") 1272 | 1273 | def post(self, shared, prep_res, final_summary): 1274 | shared["all_files_summary"] = final_summary 1275 | 1276 | batch_node = SummarizeAllFiles() 1277 | combine_node = CombineSummaries() 1278 | batch_node >> combine_node 1279 | 1280 | flow = Flow(start=batch_node) 1281 | 1282 | shared = { 1283 | "files": { 1284 | "file1.txt": "Alice was beginning to get very tired of sitting by her sister...", 1285 | "file2.txt": "Some other interesting text ...", 1286 | # ... 1287 | } 1288 | } 1289 | flow.run(shared) 1290 | print("Individual Summaries:", shared["file_summaries"]) 1291 | print("\nFinal Summary:\n", shared["all_files_summary"]) 1292 | ``` 1293 | 1294 | ================================================ 1295 | File: docs/design_pattern/rag.md 1296 | ================================================ 1297 | --- 1298 | layout: default 1299 | title: "RAG" 1300 | parent: "Design Pattern" 1301 | nav_order: 3 1302 | --- 1303 | 1304 | # RAG (Retrieval Augmented Generation) 1305 | 1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline: 1307 | 1308 |
1309 | 1310 |
1311 | 1312 | 1. **Offline stage**: Preprocess and index documents ("building the index"). 1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context. 1314 | 1315 | --- 1316 | ## Stage 1: Offline Indexing 1317 | 1318 | We create three Nodes: 1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text. 1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk. 1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md). 1322 | 1323 | ```python 1324 | class ChunkDocs(BatchNode): 1325 | def prep(self, shared): 1326 | # A list of file paths in shared["files"]. We process each file. 1327 | return shared["files"] 1328 | 1329 | def exec(self, filepath): 1330 | # read file content. In real usage, do error handling. 1331 | with open(filepath, "r", encoding="utf-8") as f: 1332 | text = f.read() 1333 | # chunk by 100 chars each 1334 | chunks = [] 1335 | size = 100 1336 | for i in range(0, len(text), size): 1337 | chunks.append(text[i : i + size]) 1338 | return chunks 1339 | 1340 | def post(self, shared, prep_res, exec_res_list): 1341 | # exec_res_list is a list of chunk-lists, one per file. 1342 | # flatten them all into a single list of chunks. 1343 | all_chunks = [] 1344 | for chunk_list in exec_res_list: 1345 | all_chunks.extend(chunk_list) 1346 | shared["all_chunks"] = all_chunks 1347 | 1348 | class EmbedDocs(BatchNode): 1349 | def prep(self, shared): 1350 | return shared["all_chunks"] 1351 | 1352 | def exec(self, chunk): 1353 | return get_embedding(chunk) 1354 | 1355 | def post(self, shared, prep_res, exec_res_list): 1356 | # Store the list of embeddings. 1357 | shared["all_embeds"] = exec_res_list 1358 | print(f"Total embeddings: {len(exec_res_list)}") 1359 | 1360 | class StoreIndex(Node): 1361 | def prep(self, shared): 1362 | # We'll read all embeds from shared. 1363 | return shared["all_embeds"] 1364 | 1365 | def exec(self, all_embeds): 1366 | # Create a vector index (faiss or other DB in real usage). 1367 | index = create_index(all_embeds) 1368 | return index 1369 | 1370 | def post(self, shared, prep_res, index): 1371 | shared["index"] = index 1372 | 1373 | # Wire them in sequence 1374 | chunk_node = ChunkDocs() 1375 | embed_node = EmbedDocs() 1376 | store_node = StoreIndex() 1377 | 1378 | chunk_node >> embed_node >> store_node 1379 | 1380 | OfflineFlow = Flow(start=chunk_node) 1381 | ``` 1382 | 1383 | Usage example: 1384 | 1385 | ```python 1386 | shared = { 1387 | "files": ["doc1.txt", "doc2.txt"], # any text files 1388 | } 1389 | OfflineFlow.run(shared) 1390 | ``` 1391 | 1392 | --- 1393 | ## Stage 2: Online Query & Answer 1394 | 1395 | We have 3 nodes: 1396 | 1. `EmbedQuery` – embeds the user’s question. 1397 | 2. `RetrieveDocs` – retrieves top chunk from the index. 1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer. 1399 | 1400 | ```python 1401 | class EmbedQuery(Node): 1402 | def prep(self, shared): 1403 | return shared["question"] 1404 | 1405 | def exec(self, question): 1406 | return get_embedding(question) 1407 | 1408 | def post(self, shared, prep_res, q_emb): 1409 | shared["q_emb"] = q_emb 1410 | 1411 | class RetrieveDocs(Node): 1412 | def prep(self, shared): 1413 | # We'll need the query embedding, plus the offline index/chunks 1414 | return shared["q_emb"], shared["index"], shared["all_chunks"] 1415 | 1416 | def exec(self, inputs): 1417 | q_emb, index, chunks = inputs 1418 | I, D = search_index(index, q_emb, top_k=1) 1419 | best_id = I[0][0] 1420 | relevant_chunk = chunks[best_id] 1421 | return relevant_chunk 1422 | 1423 | def post(self, shared, prep_res, relevant_chunk): 1424 | shared["retrieved_chunk"] = relevant_chunk 1425 | print("Retrieved chunk:", relevant_chunk[:60], "...") 1426 | 1427 | class GenerateAnswer(Node): 1428 | def prep(self, shared): 1429 | return shared["question"], shared["retrieved_chunk"] 1430 | 1431 | def exec(self, inputs): 1432 | question, chunk = inputs 1433 | prompt = f"Question: {question}\nContext: {chunk}\nAnswer:" 1434 | return call_llm(prompt) 1435 | 1436 | def post(self, shared, prep_res, answer): 1437 | shared["answer"] = answer 1438 | print("Answer:", answer) 1439 | 1440 | embed_qnode = EmbedQuery() 1441 | retrieve_node = RetrieveDocs() 1442 | generate_node = GenerateAnswer() 1443 | 1444 | embed_qnode >> retrieve_node >> generate_node 1445 | OnlineFlow = Flow(start=embed_qnode) 1446 | ``` 1447 | 1448 | Usage example: 1449 | 1450 | ```python 1451 | # Suppose we already ran OfflineFlow and have: 1452 | # shared["all_chunks"], shared["index"], etc. 1453 | shared["question"] = "Why do people like cats?" 1454 | 1455 | OnlineFlow.run(shared) 1456 | # final answer in shared["answer"] 1457 | ``` 1458 | 1459 | ================================================ 1460 | File: docs/design_pattern/structure.md 1461 | ================================================ 1462 | --- 1463 | layout: default 1464 | title: "Structured Output" 1465 | parent: "Design Pattern" 1466 | nav_order: 5 1467 | --- 1468 | 1469 | # Structured Output 1470 | 1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys. 1472 | 1473 | There are several approaches to achieve a structured output: 1474 | - **Prompting** the LLM to strictly return a defined structure. 1475 | - Using LLMs that natively support **schema enforcement**. 1476 | - **Post-processing** the LLM's response to extract structured content. 1477 | 1478 | In practice, **Prompting** is simple and reliable for modern LLMs. 1479 | 1480 | ### Example Use Cases 1481 | 1482 | - Extracting Key Information 1483 | 1484 | ```yaml 1485 | product: 1486 | name: Widget Pro 1487 | price: 199.99 1488 | description: | 1489 | A high-quality widget designed for professionals. 1490 | Recommended for advanced users. 1491 | ``` 1492 | 1493 | - Summarizing Documents into Bullet Points 1494 | 1495 | ```yaml 1496 | summary: 1497 | - This product is easy to use. 1498 | - It is cost-effective. 1499 | - Suitable for all skill levels. 1500 | ``` 1501 | 1502 | - Generating Configuration Files 1503 | 1504 | ```yaml 1505 | server: 1506 | host: 127.0.0.1 1507 | port: 8080 1508 | ssl: true 1509 | ``` 1510 | 1511 | ## Prompt Engineering 1512 | 1513 | When prompting the LLM to produce **structured** output: 1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`). 1515 | 2. **Validate** that all required fields exist (and let `Node` handles retry). 1516 | 1517 | ### Example Text Summarization 1518 | 1519 | ```python 1520 | class SummarizeNode(Node): 1521 | def exec(self, prep_res): 1522 | # Suppose `prep_res` is the text to summarize. 1523 | prompt = f""" 1524 | Please summarize the following text as YAML, with exactly 3 bullet points 1525 | 1526 | {prep_res} 1527 | 1528 | Now, output: 1529 | ```yaml 1530 | summary: 1531 | - bullet 1 1532 | - bullet 2 1533 | - bullet 3 1534 | ```""" 1535 | response = call_llm(prompt) 1536 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 1537 | 1538 | import yaml 1539 | structured_result = yaml.safe_load(yaml_str) 1540 | 1541 | assert "summary" in structured_result 1542 | assert isinstance(structured_result["summary"], list) 1543 | 1544 | return structured_result 1545 | ``` 1546 | 1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic) 1548 | {: .note } 1549 | 1550 | ### Why YAML instead of JSON? 1551 | 1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes. 1553 | 1554 | **In JSON** 1555 | 1556 | ```json 1557 | { 1558 | "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\"" 1559 | } 1560 | ``` 1561 | 1562 | - Every double quote inside the string must be escaped with `\"`. 1563 | - Each newline in the dialogue must be represented as `\n`. 1564 | 1565 | **In YAML** 1566 | 1567 | ```yaml 1568 | dialogue: | 1569 | Alice said: "Hello Bob. 1570 | How are you? 1571 | I am good." 1572 | ``` 1573 | 1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`). 1575 | - Newlines are naturally preserved without needing `\n`. 1576 | 1577 | ================================================ 1578 | File: docs/design_pattern/workflow.md 1579 | ================================================ 1580 | --- 1581 | layout: default 1582 | title: "Workflow" 1583 | parent: "Design Pattern" 1584 | nav_order: 2 1585 | --- 1586 | 1587 | # Workflow 1588 | 1589 | Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes. 1590 | 1591 |
1592 | 1593 |
1594 | 1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*. 1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*. 1597 | > 1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md). 1599 | {: .best-practice } 1600 | 1601 | ### Example: Article Writing 1602 | 1603 | ```python 1604 | class GenerateOutline(Node): 1605 | def prep(self, shared): return shared["topic"] 1606 | def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}") 1607 | def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res 1608 | 1609 | class WriteSection(Node): 1610 | def prep(self, shared): return shared["outline"] 1611 | def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}") 1612 | def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res 1613 | 1614 | class ReviewAndRefine(Node): 1615 | def prep(self, shared): return shared["draft"] 1616 | def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}") 1617 | def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res 1618 | 1619 | # Connect nodes 1620 | outline = GenerateOutline() 1621 | write = WriteSection() 1622 | review = ReviewAndRefine() 1623 | 1624 | outline >> write >> review 1625 | 1626 | # Create and run flow 1627 | writing_flow = Flow(start=outline) 1628 | shared = {"topic": "AI Safety"} 1629 | writing_flow.run(shared) 1630 | ``` 1631 | 1632 | For *dynamic cases*, consider using [Agents](./agent.md). 1633 | 1634 | ================================================ 1635 | File: docs/utility_function/llm.md 1636 | ================================================ 1637 | --- 1638 | layout: default 1639 | title: "LLM Wrapper" 1640 | parent: "Utility Function" 1641 | nav_order: 1 1642 | --- 1643 | 1644 | # LLM Wrappers 1645 | 1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm). 1647 | Here, we provide some minimal example implementations: 1648 | 1649 | 1. OpenAI 1650 | ```python 1651 | def call_llm(prompt): 1652 | from openai import OpenAI 1653 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 1654 | r = client.chat.completions.create( 1655 | model="gpt-4o", 1656 | messages=[{"role": "user", "content": prompt}] 1657 | ) 1658 | return r.choices[0].message.content 1659 | 1660 | # Example usage 1661 | call_llm("How are you?") 1662 | ``` 1663 | > Store the API key in an environment variable like OPENAI_API_KEY for security. 1664 | {: .best-practice } 1665 | 1666 | 2. Claude (Anthropic) 1667 | ```python 1668 | def call_llm(prompt): 1669 | from anthropic import Anthropic 1670 | client = Anthropic(api_key="YOUR_API_KEY_HERE") 1671 | r = client.messages.create( 1672 | model="claude-sonnet-4-0", 1673 | messages=[ 1674 | {"role": "user", "content": prompt} 1675 | ] 1676 | ) 1677 | return r.content[0].text 1678 | ``` 1679 | 1680 | 3. Google (Generative AI Studio / PaLM API) 1681 | ```python 1682 | def call_llm(prompt): 1683 | from google import genai 1684 | client = genai.Client(api_key='GEMINI_API_KEY') 1685 | response = client.models.generate_content( 1686 | model='gemini-2.5-pro', 1687 | contents=prompt 1688 | ) 1689 | return response.text 1690 | ``` 1691 | 1692 | 4. Azure (Azure OpenAI) 1693 | ```python 1694 | def call_llm(prompt): 1695 | from openai import AzureOpenAI 1696 | client = AzureOpenAI( 1697 | azure_endpoint="https://.openai.azure.com/", 1698 | api_key="YOUR_API_KEY_HERE", 1699 | api_version="2023-05-15" 1700 | ) 1701 | r = client.chat.completions.create( 1702 | model="", 1703 | messages=[{"role": "user", "content": prompt}] 1704 | ) 1705 | return r.choices[0].message.content 1706 | ``` 1707 | 1708 | 5. Ollama (Local LLM) 1709 | ```python 1710 | def call_llm(prompt): 1711 | from ollama import chat 1712 | response = chat( 1713 | model="llama2", 1714 | messages=[{"role": "user", "content": prompt}] 1715 | ) 1716 | return response.message.content 1717 | ``` 1718 | 1719 | ## Improvements 1720 | Feel free to enhance your `call_llm` function as needed. Here are examples: 1721 | 1722 | - Handle chat history: 1723 | 1724 | ```python 1725 | def call_llm(messages): 1726 | from openai import OpenAI 1727 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 1728 | r = client.chat.completions.create( 1729 | model="gpt-4o", 1730 | messages=messages 1731 | ) 1732 | return r.choices[0].message.content 1733 | ``` 1734 | 1735 | - Add in-memory caching 1736 | 1737 | ```python 1738 | from functools import lru_cache 1739 | 1740 | @lru_cache(maxsize=1000) 1741 | def call_llm(prompt): 1742 | # Your implementation here 1743 | pass 1744 | ``` 1745 | 1746 | > ⚠️ Caching conflicts with Node retries, as retries yield the same result. 1747 | > 1748 | > To address this, you could use cached results only if not retried. 1749 | {: .warning } 1750 | 1751 | 1752 | ```python 1753 | from functools import lru_cache 1754 | 1755 | @lru_cache(maxsize=1000) 1756 | def cached_call(prompt): 1757 | pass 1758 | 1759 | def call_llm(prompt, use_cache): 1760 | if use_cache: 1761 | return cached_call(prompt) 1762 | # Call the underlying function directly 1763 | return cached_call.__wrapped__(prompt) 1764 | 1765 | class SummarizeNode(Node): 1766 | def exec(self, text): 1767 | return call_llm(f"Summarize: {text}", self.cur_retry==0) 1768 | ``` 1769 | 1770 | - Enable logging: 1771 | 1772 | ```python 1773 | def call_llm(prompt): 1774 | import logging 1775 | logging.info(f"Prompt: {prompt}") 1776 | response = ... # Your implementation here 1777 | logging.info(f"Response: {response}") 1778 | return response 1779 | ``` -------------------------------------------------------------------------------- /.cursorrules: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: "Agentic Coding" 4 | --- 5 | 6 | # Agentic Coding: Humans Design, Agents code! 7 | 8 | > If you are an AI agent involved in building LLM Systems, read this guide **VERY, VERY** carefully! This is the most important chapter in the entire document. Throughout development, you should always (1) start with a small and simple solution, (2) design at a high level (`docs/design.md`) before implementation, and (3) frequently ask humans for feedback and clarification. 9 | {: .warning } 10 | 11 | ## Agentic Coding Steps 12 | 13 | Agentic Coding should be a collaboration between Human System Design and Agent Implementation: 14 | 15 | | Steps | Human | AI | Comment | 16 | |:-----------------------|:----------:|:---------:|:------------------------------------------------------------------------| 17 | | 1. Requirements | ★★★ High | ★☆☆ Low | Humans understand the requirements and context. | 18 | | 2. Flow | ★★☆ Medium | ★★☆ Medium | Humans specify the high-level design, and the AI fills in the details. | 19 | | 3. Utilities | ★★☆ Medium | ★★☆ Medium | Humans provide available external APIs and integrations, and the AI helps with implementation. | 20 | | 4. Data | ★☆☆ Low | ★★★ High | AI designs the data schema, and humans verify. | 21 | | 5. Node | ★☆☆ Low | ★★★ High | The AI helps design the node based on the flow. | 22 | | 6. Implementation | ★☆☆ Low | ★★★ High | The AI implements the flow based on the design. | 23 | | 7. Optimization | ★★☆ Medium | ★★☆ Medium | Humans evaluate the results, and the AI helps optimize. | 24 | | 8. Reliability | ★☆☆ Low | ★★★ High | The AI writes test cases and addresses corner cases. | 25 | 26 | 1. **Requirements**: Clarify the requirements for your project, and evaluate whether an AI system is a good fit. 27 | - Understand AI systems' strengths and limitations: 28 | - **Good for**: Routine tasks requiring common sense (filling forms, replying to emails) 29 | - **Good for**: Creative tasks with well-defined inputs (building slides, writing SQL) 30 | - **Not good for**: Ambiguous problems requiring complex decision-making (business strategy, startup planning) 31 | - **Keep It User-Centric:** Explain the "problem" from the user's perspective rather than just listing features. 32 | - **Balance complexity vs. impact**: Aim to deliver the highest value features with minimal complexity early. 33 | 34 | 2. **Flow Design**: Outline at a high level, describe how your AI system orchestrates nodes. 35 | - Identify applicable design patterns (e.g., [Map Reduce](./design_pattern/mapreduce.md), [Agent](./design_pattern/agent.md), [RAG](./design_pattern/rag.md)). 36 | - For each node in the flow, start with a high-level one-line description of what it does. 37 | - If using **Map Reduce**, specify how to map (what to split) and how to reduce (how to combine). 38 | - If using **Agent**, specify what are the inputs (context) and what are the possible actions. 39 | - If using **RAG**, specify what to embed, noting that there's usually both offline (indexing) and online (retrieval) workflows. 40 | - Outline the flow and draw it in a mermaid diagram. For example: 41 | ```mermaid 42 | flowchart LR 43 | start[Start] --> batch[Batch] 44 | batch --> check[Check] 45 | check -->|OK| process 46 | check -->|Error| fix[Fix] 47 | fix --> check 48 | 49 | subgraph process[Process] 50 | step1[Step 1] --> step2[Step 2] 51 | end 52 | 53 | process --> endNode[End] 54 | ``` 55 | - > **If Humans can't specify the flow, AI Agents can't automate it!** Before building an LLM system, thoroughly understand the problem and potential solution by manually solving example inputs to develop intuition. 56 | {: .best-practice } 57 | 58 | 3. **Utilities**: Based on the Flow Design, identify and implement necessary utility functions. 59 | - Think of your AI system as the brain. It needs a body—these *external utility functions*—to interact with the real world: 60 |
61 | 62 | - Reading inputs (e.g., retrieving Slack messages, reading emails) 63 | - Writing outputs (e.g., generating reports, sending emails) 64 | - Using external tools (e.g., calling LLMs, searching the web) 65 | - **NOTE**: *LLM-based tasks* (e.g., summarizing text, analyzing sentiment) are **NOT** utility functions; rather, they are *core functions* internal in the AI system. 66 | - For each utility function, implement it and write a simple test. 67 | - Document their input/output, as well as why they are necessary. For example: 68 | - `name`: `get_embedding` (`utils/get_embedding.py`) 69 | - `input`: `str` 70 | - `output`: a vector of 3072 floats 71 | - `necessity`: Used by the second node to embed text 72 | - Example utility implementation: 73 | ```python 74 | # utils/call_llm.py 75 | from openai import OpenAI 76 | 77 | def call_llm(prompt): 78 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 79 | r = client.chat.completions.create( 80 | model="gpt-4o", 81 | messages=[{"role": "user", "content": prompt}] 82 | ) 83 | return r.choices[0].message.content 84 | 85 | if __name__ == "__main__": 86 | prompt = "What is the meaning of life?" 87 | print(call_llm(prompt)) 88 | ``` 89 | - > **Sometimes, design Utilities before Flow:** For example, for an LLM project to automate a legacy system, the bottleneck will likely be the available interface to that system. Start by designing the hardest utilities for interfacing, and then build the flow around them. 90 | {: .best-practice } 91 | - > **Avoid Exception Handling in Utilities**: If a utility function is called from a Node's `exec()` method, avoid using `try...except` blocks within the utility. Let the Node's built-in retry mechanism handle failures. 92 | {: .warning } 93 | 94 | 4. **Data Design**: Design the shared store that nodes will use to communicate. 95 | - One core design principle for PocketFlow is to use a well-designed [shared store](./core_abstraction/communication.md)—a data contract that all nodes agree upon to retrieve and store data. 96 | - For simple systems, use an in-memory dictionary. 97 | - For more complex systems or when persistence is required, use a database. 98 | - **Don't Repeat Yourself**: Use in-memory references or foreign keys. 99 | - Example shared store design: 100 | ```python 101 | shared = { 102 | "user": { 103 | "id": "user123", 104 | "context": { # Another nested dict 105 | "weather": {"temp": 72, "condition": "sunny"}, 106 | "location": "San Francisco" 107 | } 108 | }, 109 | "results": {} # Empty dict to store outputs 110 | } 111 | ``` 112 | 113 | 5. **Node Design**: Plan how each node will read and write data, and use utility functions. 114 | - For each [Node](./core_abstraction/node.md), describe its type, how it reads and writes data, and which utility function it uses. Keep it specific but high-level without codes. For example: 115 | - `type`: Regular (or Batch, or Async) 116 | - `prep`: Read "text" from the shared store 117 | - `exec`: Call the embedding utility function. **Avoid exception handling here**; let the Node's retry mechanism manage failures. 118 | - `post`: Write "embedding" to the shared store 119 | 120 | 6. **Implementation**: Implement the initial nodes and flows based on the design. 121 | - 🎉 If you've reached this step, humans have finished the design. Now *Agentic Coding* begins! 122 | - **"Keep it simple, stupid!"** Avoid complex features and full-scale type checking. 123 | - **FAIL FAST**! Leverage the built-in [Node](./core_abstraction/node.md) retry and fallback mechanisms to handle failures gracefully. This helps you quickly identify weak points in the system. 124 | - Add logging throughout the code to facilitate debugging. 125 | 126 | 7. **Optimization**: 127 | - **Use Intuition**: For a quick initial evaluation, human intuition is often a good start. 128 | - **Redesign Flow (Back to Step 3)**: Consider breaking down tasks further, introducing agentic decisions, or better managing input contexts. 129 | - If your flow design is already solid, move on to micro-optimizations: 130 | - **Prompt Engineering**: Use clear, specific instructions with examples to reduce ambiguity. 131 | - **In-Context Learning**: Provide robust examples for tasks that are difficult to specify with instructions alone. 132 | 133 | - > **You'll likely iterate a lot!** Expect to repeat Steps 3–6 hundreds of times. 134 | > 135 | >
136 | {: .best-practice } 137 | 138 | 8. **Reliability** 139 | - **Node Retries**: Add checks in the node `exec` to ensure outputs meet requirements, and consider increasing `max_retries` and `wait` times. 140 | - **Logging and Visualization**: Maintain logs of all attempts and visualize node results for easier debugging. 141 | - **Self-Evaluation**: Add a separate node (powered by an LLM) to review outputs when results are uncertain. 142 | 143 | ## Example LLM Project File Structure 144 | 145 | ``` 146 | my_project/ 147 | ├── main.py 148 | ├── nodes.py 149 | ├── flow.py 150 | ├── utils/ 151 | │ ├── __init__.py 152 | │ ├── call_llm.py 153 | │ └── search_web.py 154 | ├── requirements.txt 155 | └── docs/ 156 | └── design.md 157 | ``` 158 | 159 | - **`requirements.txt`**: Lists the Python dependencies for the project. 160 | ``` 161 | PyYAML 162 | pocketflow 163 | ``` 164 | 165 | - **`docs/design.md`**: Contains project documentation for each step above. This should be *high-level* and *no-code*. 166 | ~~~ 167 | # Design Doc: Your Project Name 168 | 169 | > Please DON'T remove notes for AI 170 | 171 | ## Requirements 172 | 173 | > Notes for AI: Keep it simple and clear. 174 | > If the requirements are abstract, write concrete user stories 175 | 176 | 177 | ## Flow Design 178 | 179 | > Notes for AI: 180 | > 1. Consider the design patterns of agent, map-reduce, rag, and workflow. Apply them if they fit. 181 | > 2. Present a concise, high-level description of the workflow. 182 | 183 | ### Applicable Design Pattern: 184 | 185 | 1. Map the file summary into chunks, then reduce these chunks into a final summary. 186 | 2. Agentic file finder 187 | - *Context*: The entire summary of the file 188 | - *Action*: Find the file 189 | 190 | ### Flow high-level Design: 191 | 192 | 1. **First Node**: This node is for ... 193 | 2. **Second Node**: This node is for ... 194 | 3. **Third Node**: This node is for ... 195 | 196 | ```mermaid 197 | flowchart TD 198 | firstNode[First Node] --> secondNode[Second Node] 199 | secondNode --> thirdNode[Third Node] 200 | ``` 201 | ## Utility Functions 202 | 203 | > Notes for AI: 204 | > 1. Understand the utility function definition thoroughly by reviewing the doc. 205 | > 2. Include only the necessary utility functions, based on nodes in the flow. 206 | 207 | 1. **Call LLM** (`utils/call_llm.py`) 208 | - *Input*: prompt (str) 209 | - *Output*: response (str) 210 | - Generally used by most nodes for LLM tasks 211 | 212 | 2. **Embedding** (`utils/get_embedding.py`) 213 | - *Input*: str 214 | - *Output*: a vector of 3072 floats 215 | - Used by the second node to embed text 216 | 217 | ## Node Design 218 | 219 | ### Shared Store 220 | 221 | > Notes for AI: Try to minimize data redundancy 222 | 223 | The shared store structure is organized as follows: 224 | 225 | ```python 226 | shared = { 227 | "key": "value" 228 | } 229 | ``` 230 | 231 | ### Node Steps 232 | 233 | > Notes for AI: Carefully decide whether to use Batch/Async Node/Flow. 234 | 235 | 1. First Node 236 | - *Purpose*: Provide a short explanation of the node’s function 237 | - *Type*: Decide between Regular, Batch, or Async 238 | - *Steps*: 239 | - *prep*: Read "key" from the shared store 240 | - *exec*: Call the utility function 241 | - *post*: Write "key" to the shared store 242 | 243 | 2. Second Node 244 | ... 245 | ~~~ 246 | 247 | 248 | - **`utils/`**: Contains all utility functions. 249 | - It's recommended to dedicate one Python file to each API call, for example `call_llm.py` or `search_web.py`. 250 | - Each file should also include a `main()` function to try that API call 251 | ```python 252 | from google import genai 253 | import os 254 | 255 | def call_llm(prompt: str) -> str: 256 | client = genai.Client( 257 | api_key=os.getenv("GEMINI_API_KEY", ""), 258 | ) 259 | model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") 260 | response = client.models.generate_content(model=model, contents=[prompt]) 261 | return response.text 262 | 263 | if __name__ == "__main__": 264 | test_prompt = "Hello, how are you?" 265 | 266 | # First call - should hit the API 267 | print("Making call...") 268 | response1 = call_llm(test_prompt, use_cache=False) 269 | print(f"Response: {response1}") 270 | ``` 271 | 272 | - **`nodes.py`**: Contains all the node definitions. 273 | ```python 274 | # nodes.py 275 | from pocketflow import Node 276 | from utils.call_llm import call_llm 277 | 278 | class GetQuestionNode(Node): 279 | def exec(self, _): 280 | # Get question directly from user input 281 | user_question = input("Enter your question: ") 282 | return user_question 283 | 284 | def post(self, shared, prep_res, exec_res): 285 | # Store the user's question 286 | shared["question"] = exec_res 287 | return "default" # Go to the next node 288 | 289 | class AnswerNode(Node): 290 | def prep(self, shared): 291 | # Read question from shared 292 | return shared["question"] 293 | 294 | def exec(self, question): 295 | # Call LLM to get the answer 296 | return call_llm(question) 297 | 298 | def post(self, shared, prep_res, exec_res): 299 | # Store the answer in shared 300 | shared["answer"] = exec_res 301 | ``` 302 | - **`flow.py`**: Implements functions that create flows by importing node definitions and connecting them. 303 | ```python 304 | # flow.py 305 | from pocketflow import Flow 306 | from nodes import GetQuestionNode, AnswerNode 307 | 308 | def create_qa_flow(): 309 | """Create and return a question-answering flow.""" 310 | # Create nodes 311 | get_question_node = GetQuestionNode() 312 | answer_node = AnswerNode() 313 | 314 | # Connect nodes in sequence 315 | get_question_node >> answer_node 316 | 317 | # Create flow starting with input node 318 | return Flow(start=get_question_node) 319 | ``` 320 | - **`main.py`**: Serves as the project's entry point. 321 | ```python 322 | # main.py 323 | from flow import create_qa_flow 324 | 325 | # Example main function 326 | # Please replace this with your own main function 327 | def main(): 328 | shared = { 329 | "question": None, # Will be populated by GetQuestionNode from user input 330 | "answer": None # Will be populated by AnswerNode 331 | } 332 | 333 | # Create the flow and run it 334 | qa_flow = create_qa_flow() 335 | qa_flow.run(shared) 336 | print(f"Question: {shared['question']}") 337 | print(f"Answer: {shared['answer']}") 338 | 339 | if __name__ == "__main__": 340 | main() 341 | ``` 342 | 343 | ================================================ 344 | File: docs/index.md 345 | ================================================ 346 | --- 347 | layout: default 348 | title: "Home" 349 | nav_order: 1 350 | --- 351 | 352 | # Pocket Flow 353 | 354 | A [100-line](https://github.com/the-pocket/PocketFlow/blob/main/pocketflow/__init__.py) minimalist LLM framework for *Agents, Task Decomposition, RAG, etc*. 355 | 356 | - **Lightweight**: Just the core graph abstraction in 100 lines. ZERO dependencies, and vendor lock-in. 357 | - **Expressive**: Everything you love from larger frameworks—([Multi-](./design_pattern/multi_agent.html))[Agents](./design_pattern/agent.html), [Workflow](./design_pattern/workflow.html), [RAG](./design_pattern/rag.html), and more. 358 | - **Agentic-Coding**: Intuitive enough for AI agents to help humans build complex LLM applications. 359 | 360 |
361 | 362 |
363 | 364 | ## Core Abstraction 365 | 366 | We model the LLM workflow as a **Graph + Shared Store**: 367 | 368 | - [Node](./core_abstraction/node.md) handles simple (LLM) tasks. 369 | - [Flow](./core_abstraction/flow.md) connects nodes through **Actions** (labeled edges). 370 | - [Shared Store](./core_abstraction/communication.md) enables communication between nodes within flows. 371 | - [Batch](./core_abstraction/batch.md) nodes/flows allow for data-intensive tasks. 372 | - [Async](./core_abstraction/async.md) nodes/flows allow waiting for asynchronous tasks. 373 | - [(Advanced) Parallel](./core_abstraction/parallel.md) nodes/flows handle I/O-bound tasks. 374 | 375 |
376 | 377 |
378 | 379 | ## Design Pattern 380 | 381 | From there, it’s easy to implement popular design patterns: 382 | 383 | - [Agent](./design_pattern/agent.md) autonomously makes decisions. 384 | - [Workflow](./design_pattern/workflow.md) chains multiple tasks into pipelines. 385 | - [RAG](./design_pattern/rag.md) integrates data retrieval with generation. 386 | - [Map Reduce](./design_pattern/mapreduce.md) splits data tasks into Map and Reduce steps. 387 | - [Structured Output](./design_pattern/structure.md) formats outputs consistently. 388 | - [(Advanced) Multi-Agents](./design_pattern/multi_agent.md) coordinate multiple agents. 389 | 390 |
391 | 392 |
393 | 394 | ## Utility Function 395 | 396 | We **do not** provide built-in utilities. Instead, we offer *examples*—please *implement your own*: 397 | 398 | - [LLM Wrapper](./utility_function/llm.md) 399 | - [Viz and Debug](./utility_function/viz.md) 400 | - [Web Search](./utility_function/websearch.md) 401 | - [Chunking](./utility_function/chunking.md) 402 | - [Embedding](./utility_function/embedding.md) 403 | - [Vector Databases](./utility_function/vector.md) 404 | - [Text-to-Speech](./utility_function/text_to_speech.md) 405 | 406 | **Why not built-in?**: I believe it's a *bad practice* for vendor-specific APIs in a general framework: 407 | - *API Volatility*: Frequent changes lead to heavy maintenance for hardcoded APIs. 408 | - *Flexibility*: You may want to switch vendors, use fine-tuned models, or run them locally. 409 | - *Optimizations*: Prompt caching, batching, and streaming are easier without vendor lock-in. 410 | 411 | ## Ready to build your Apps? 412 | 413 | Check out [Agentic Coding Guidance](./guide.md), the fastest way to develop LLM projects with Pocket Flow! 414 | 415 | ================================================ 416 | File: docs/core_abstraction/async.md 417 | ================================================ 418 | --- 419 | layout: default 420 | title: "(Advanced) Async" 421 | parent: "Core Abstraction" 422 | nav_order: 5 423 | --- 424 | 425 | # (Advanced) Async 426 | 427 | **Async** Nodes implement `prep_async()`, `exec_async()`, `exec_fallback_async()`, and/or `post_async()`. This is useful for: 428 | 429 | 1. **prep_async()**: For *fetching/reading data (files, APIs, DB)* in an I/O-friendly way. 430 | 2. **exec_async()**: Typically used for async LLM calls. 431 | 3. **post_async()**: For *awaiting user feedback*, *coordinating across multi-agents* or any additional async steps after `exec_async()`. 432 | 433 | **Note**: `AsyncNode` must be wrapped in `AsyncFlow`. `AsyncFlow` can also include regular (sync) nodes. 434 | 435 | ### Example 436 | 437 | ```python 438 | class SummarizeThenVerify(AsyncNode): 439 | async def prep_async(self, shared): 440 | # Example: read a file asynchronously 441 | doc_text = await read_file_async(shared["doc_path"]) 442 | return doc_text 443 | 444 | async def exec_async(self, prep_res): 445 | # Example: async LLM call 446 | summary = await call_llm_async(f"Summarize: {prep_res}") 447 | return summary 448 | 449 | async def post_async(self, shared, prep_res, exec_res): 450 | # Example: wait for user feedback 451 | decision = await gather_user_feedback(exec_res) 452 | if decision == "approve": 453 | shared["summary"] = exec_res 454 | return "approve" 455 | return "deny" 456 | 457 | summarize_node = SummarizeThenVerify() 458 | final_node = Finalize() 459 | 460 | # Define transitions 461 | summarize_node - "approve" >> final_node 462 | summarize_node - "deny" >> summarize_node # retry 463 | 464 | flow = AsyncFlow(start=summarize_node) 465 | 466 | async def main(): 467 | shared = {"doc_path": "document.txt"} 468 | await flow.run_async(shared) 469 | print("Final Summary:", shared.get("summary")) 470 | 471 | asyncio.run(main()) 472 | ``` 473 | 474 | ================================================ 475 | File: docs/core_abstraction/batch.md 476 | ================================================ 477 | --- 478 | layout: default 479 | title: "Batch" 480 | parent: "Core Abstraction" 481 | nav_order: 4 482 | --- 483 | 484 | # Batch 485 | 486 | **Batch** makes it easier to handle large inputs in one Node or **rerun** a Flow multiple times. Example use cases: 487 | - **Chunk-based** processing (e.g., splitting large texts). 488 | - **Iterative** processing over lists of input items (e.g., user queries, files, URLs). 489 | 490 | ## 1. BatchNode 491 | 492 | A **BatchNode** extends `Node` but changes `prep()` and `exec()`: 493 | 494 | - **`prep(shared)`**: returns an **iterable** (e.g., list, generator). 495 | - **`exec(item)`**: called **once** per item in that iterable. 496 | - **`post(shared, prep_res, exec_res_list)`**: after all items are processed, receives a **list** of results (`exec_res_list`) and returns an **Action**. 497 | 498 | 499 | ### Example: Summarize a Large File 500 | 501 | ```python 502 | class MapSummaries(BatchNode): 503 | def prep(self, shared): 504 | # Suppose we have a big file; chunk it 505 | content = shared["data"] 506 | chunk_size = 10000 507 | chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] 508 | return chunks 509 | 510 | def exec(self, chunk): 511 | prompt = f"Summarize this chunk in 10 words: {chunk}" 512 | summary = call_llm(prompt) 513 | return summary 514 | 515 | def post(self, shared, prep_res, exec_res_list): 516 | combined = "\n".join(exec_res_list) 517 | shared["summary"] = combined 518 | return "default" 519 | 520 | map_summaries = MapSummaries() 521 | flow = Flow(start=map_summaries) 522 | flow.run(shared) 523 | ``` 524 | 525 | --- 526 | 527 | ## 2. BatchFlow 528 | 529 | A **BatchFlow** runs a **Flow** multiple times, each time with different `params`. Think of it as a loop that replays the Flow for each parameter set. 530 | 531 | ### Example: Summarize Many Files 532 | 533 | ```python 534 | class SummarizeAllFiles(BatchFlow): 535 | def prep(self, shared): 536 | # Return a list of param dicts (one per file) 537 | filenames = list(shared["data"].keys()) # e.g., ["file1.txt", "file2.txt", ...] 538 | return [{"filename": fn} for fn in filenames] 539 | 540 | # Suppose we have a per-file Flow (e.g., load_file >> summarize >> reduce): 541 | summarize_file = SummarizeFile(start=load_file) 542 | 543 | # Wrap that flow into a BatchFlow: 544 | summarize_all_files = SummarizeAllFiles(start=summarize_file) 545 | summarize_all_files.run(shared) 546 | ``` 547 | 548 | ### Under the Hood 549 | 1. `prep(shared)` returns a list of param dicts—e.g., `[{filename: "file1.txt"}, {filename: "file2.txt"}, ...]`. 550 | 2. The **BatchFlow** loops through each dict. For each one: 551 | - It merges the dict with the BatchFlow’s own `params`. 552 | - It calls `flow.run(shared)` using the merged result. 553 | 3. This means the sub-Flow is run **repeatedly**, once for every param dict. 554 | 555 | --- 556 | 557 | ## 3. Nested or Multi-Level Batches 558 | 559 | You can nest a **BatchFlow** in another **BatchFlow**. For instance: 560 | - **Outer** batch: returns a list of diretory param dicts (e.g., `{"directory": "/pathA"}`, `{"directory": "/pathB"}`, ...). 561 | - **Inner** batch: returning a list of per-file param dicts. 562 | 563 | At each level, **BatchFlow** merges its own param dict with the parent’s. By the time you reach the **innermost** node, the final `params` is the merged result of **all** parents in the chain. This way, a nested structure can keep track of the entire context (e.g., directory + file name) at once. 564 | 565 | ```python 566 | 567 | class FileBatchFlow(BatchFlow): 568 | def prep(self, shared): 569 | directory = self.params["directory"] 570 | # e.g., files = ["file1.txt", "file2.txt", ...] 571 | files = [f for f in os.listdir(directory) if f.endswith(".txt")] 572 | return [{"filename": f} for f in files] 573 | 574 | class DirectoryBatchFlow(BatchFlow): 575 | def prep(self, shared): 576 | directories = [ "/path/to/dirA", "/path/to/dirB"] 577 | return [{"directory": d} for d in directories] 578 | 579 | # MapSummaries have params like {"directory": "/path/to/dirA", "filename": "file1.txt"} 580 | inner_flow = FileBatchFlow(start=MapSummaries()) 581 | outer_flow = DirectoryBatchFlow(start=inner_flow) 582 | ``` 583 | 584 | ================================================ 585 | File: docs/core_abstraction/communication.md 586 | ================================================ 587 | --- 588 | layout: default 589 | title: "Communication" 590 | parent: "Core Abstraction" 591 | nav_order: 3 592 | --- 593 | 594 | # Communication 595 | 596 | Nodes and Flows **communicate** in 2 ways: 597 | 598 | 1. **Shared Store (for almost all the cases)** 599 | 600 | - A global data structure (often an in-mem dict) that all nodes can read ( `prep()`) and write (`post()`). 601 | - Great for data results, large content, or anything multiple nodes need. 602 | - You shall design the data structure and populate it ahead. 603 | 604 | - > **Separation of Concerns:** Use `Shared Store` for almost all cases to separate *Data Schema* from *Compute Logic*! This approach is both flexible and easy to manage, resulting in more maintainable code. `Params` is more a syntax sugar for [Batch](./batch.md). 605 | {: .best-practice } 606 | 607 | 2. **Params (only for [Batch](./batch.md))** 608 | - Each node has a local, ephemeral `params` dict passed in by the **parent Flow**, used as an identifier for tasks. Parameter keys and values shall be **immutable**. 609 | - Good for identifiers like filenames or numeric IDs, in Batch mode. 610 | 611 | If you know memory management, think of the **Shared Store** like a **heap** (shared by all function calls), and **Params** like a **stack** (assigned by the caller). 612 | 613 | --- 614 | 615 | ## 1. Shared Store 616 | 617 | ### Overview 618 | 619 | A shared store is typically an in-mem dictionary, like: 620 | ```python 621 | shared = {"data": {}, "summary": {}, "config": {...}, ...} 622 | ``` 623 | 624 | It can also contain local file handlers, DB connections, or a combination for persistence. We recommend deciding the data structure or DB schema first based on your app requirements. 625 | 626 | ### Example 627 | 628 | ```python 629 | class LoadData(Node): 630 | def post(self, shared, prep_res, exec_res): 631 | # We write data to shared store 632 | shared["data"] = "Some text content" 633 | return None 634 | 635 | class Summarize(Node): 636 | def prep(self, shared): 637 | # We read data from shared store 638 | return shared["data"] 639 | 640 | def exec(self, prep_res): 641 | # Call LLM to summarize 642 | prompt = f"Summarize: {prep_res}" 643 | summary = call_llm(prompt) 644 | return summary 645 | 646 | def post(self, shared, prep_res, exec_res): 647 | # We write summary to shared store 648 | shared["summary"] = exec_res 649 | return "default" 650 | 651 | load_data = LoadData() 652 | summarize = Summarize() 653 | load_data >> summarize 654 | flow = Flow(start=load_data) 655 | 656 | shared = {} 657 | flow.run(shared) 658 | ``` 659 | 660 | Here: 661 | - `LoadData` writes to `shared["data"]`. 662 | - `Summarize` reads from `shared["data"]`, summarizes, and writes to `shared["summary"]`. 663 | 664 | --- 665 | 666 | ## 2. Params 667 | 668 | **Params** let you store *per-Node* or *per-Flow* config that doesn't need to live in the shared store. They are: 669 | - **Immutable** during a Node's run cycle (i.e., they don't change mid-`prep->exec->post`). 670 | - **Set** via `set_params()`. 671 | - **Cleared** and updated each time a parent Flow calls it. 672 | 673 | > Only set the uppermost Flow params because others will be overwritten by the parent Flow. 674 | > 675 | > If you need to set child node params, see [Batch](./batch.md). 676 | {: .warning } 677 | 678 | Typically, **Params** are identifiers (e.g., file name, page number). Use them to fetch the task you assigned or write to a specific part of the shared store. 679 | 680 | ### Example 681 | 682 | ```python 683 | # 1) Create a Node that uses params 684 | class SummarizeFile(Node): 685 | def prep(self, shared): 686 | # Access the node's param 687 | filename = self.params["filename"] 688 | return shared["data"].get(filename, "") 689 | 690 | def exec(self, prep_res): 691 | prompt = f"Summarize: {prep_res}" 692 | return call_llm(prompt) 693 | 694 | def post(self, shared, prep_res, exec_res): 695 | filename = self.params["filename"] 696 | shared["summary"][filename] = exec_res 697 | return "default" 698 | 699 | # 2) Set params 700 | node = SummarizeFile() 701 | 702 | # 3) Set Node params directly (for testing) 703 | node.set_params({"filename": "doc1.txt"}) 704 | node.run(shared) 705 | 706 | # 4) Create Flow 707 | flow = Flow(start=node) 708 | 709 | # 5) Set Flow params (overwrites node params) 710 | flow.set_params({"filename": "doc2.txt"}) 711 | flow.run(shared) # The node summarizes doc2, not doc1 712 | ``` 713 | 714 | ================================================ 715 | File: docs/core_abstraction/flow.md 716 | ================================================ 717 | --- 718 | layout: default 719 | title: "Flow" 720 | parent: "Core Abstraction" 721 | nav_order: 2 722 | --- 723 | 724 | # Flow 725 | 726 | A **Flow** orchestrates a graph of Nodes. You can chain Nodes in a sequence or create branching depending on the **Actions** returned from each Node's `post()`. 727 | 728 | ## 1. Action-based Transitions 729 | 730 | Each Node's `post()` returns an **Action** string. By default, if `post()` doesn't return anything, we treat that as `"default"`. 731 | 732 | You define transitions with the syntax: 733 | 734 | 1. **Basic default transition**: `node_a >> node_b` 735 | This means if `node_a.post()` returns `"default"`, go to `node_b`. 736 | (Equivalent to `node_a - "default" >> node_b`) 737 | 738 | 2. **Named action transition**: `node_a - "action_name" >> node_b` 739 | This means if `node_a.post()` returns `"action_name"`, go to `node_b`. 740 | 741 | It's possible to create loops, branching, or multi-step flows. 742 | 743 | ## 2. Creating a Flow 744 | 745 | A **Flow** begins with a **start** node. You call `Flow(start=some_node)` to specify the entry point. When you call `flow.run(shared)`, it executes the start node, looks at its returned Action from `post()`, follows the transition, and continues until there's no next node. 746 | 747 | ### Example: Simple Sequence 748 | 749 | Here's a minimal flow of two nodes in a chain: 750 | 751 | ```python 752 | node_a >> node_b 753 | flow = Flow(start=node_a) 754 | flow.run(shared) 755 | ``` 756 | 757 | - When you run the flow, it executes `node_a`. 758 | - Suppose `node_a.post()` returns `"default"`. 759 | - The flow then sees `"default"` Action is linked to `node_b` and runs `node_b`. 760 | - `node_b.post()` returns `"default"` but we didn't define `node_b >> something_else`. So the flow ends there. 761 | 762 | ### Example: Branching & Looping 763 | 764 | Here's a simple expense approval flow that demonstrates branching and looping. The `ReviewExpense` node can return three possible Actions: 765 | 766 | - `"approved"`: expense is approved, move to payment processing 767 | - `"needs_revision"`: expense needs changes, send back for revision 768 | - `"rejected"`: expense is denied, finish the process 769 | 770 | We can wire them like this: 771 | 772 | ```python 773 | # Define the flow connections 774 | review - "approved" >> payment # If approved, process payment 775 | review - "needs_revision" >> revise # If needs changes, go to revision 776 | review - "rejected" >> finish # If rejected, finish the process 777 | 778 | revise >> review # After revision, go back for another review 779 | payment >> finish # After payment, finish the process 780 | 781 | flow = Flow(start=review) 782 | ``` 783 | 784 | Let's see how it flows: 785 | 786 | 1. If `review.post()` returns `"approved"`, the expense moves to the `payment` node 787 | 2. If `review.post()` returns `"needs_revision"`, it goes to the `revise` node, which then loops back to `review` 788 | 3. If `review.post()` returns `"rejected"`, it moves to the `finish` node and stops 789 | 790 | ```mermaid 791 | flowchart TD 792 | review[Review Expense] -->|approved| payment[Process Payment] 793 | review -->|needs_revision| revise[Revise Report] 794 | review -->|rejected| finish[Finish Process] 795 | 796 | revise --> review 797 | payment --> finish 798 | ``` 799 | 800 | ### Running Individual Nodes vs. Running a Flow 801 | 802 | - `node.run(shared)`: Just runs that node alone (calls `prep->exec->post()`), returns an Action. 803 | - `flow.run(shared)`: Executes from the start node, follows Actions to the next node, and so on until the flow can't continue. 804 | 805 | > `node.run(shared)` **does not** proceed to the successor. 806 | > This is mainly for debugging or testing a single node. 807 | > 808 | > Always use `flow.run(...)` in production to ensure the full pipeline runs correctly. 809 | {: .warning } 810 | 811 | ## 3. Nested Flows 812 | 813 | A **Flow** can act like a Node, which enables powerful composition patterns. This means you can: 814 | 815 | 1. Use a Flow as a Node within another Flow's transitions. 816 | 2. Combine multiple smaller Flows into a larger Flow for reuse. 817 | 3. Node `params` will be a merging of **all** parents' `params`. 818 | 819 | ### Flow's Node Methods 820 | 821 | A **Flow** is also a **Node**, so it will run `prep()` and `post()`. However: 822 | 823 | - It **won't** run `exec()`, as its main logic is to orchestrate its nodes. 824 | - `post()` always receives `None` for `exec_res` and should instead get the flow execution results from the shared store. 825 | 826 | ### Basic Flow Nesting 827 | 828 | Here's how to connect a flow to another node: 829 | 830 | ```python 831 | # Create a sub-flow 832 | node_a >> node_b 833 | subflow = Flow(start=node_a) 834 | 835 | # Connect it to another node 836 | subflow >> node_c 837 | 838 | # Create the parent flow 839 | parent_flow = Flow(start=subflow) 840 | ``` 841 | 842 | When `parent_flow.run()` executes: 843 | 1. It starts `subflow` 844 | 2. `subflow` runs through its nodes (`node_a->node_b`) 845 | 3. After `subflow` completes, execution continues to `node_c` 846 | 847 | ### Example: Order Processing Pipeline 848 | 849 | Here's a practical example that breaks down order processing into nested flows: 850 | 851 | ```python 852 | # Payment processing sub-flow 853 | validate_payment >> process_payment >> payment_confirmation 854 | payment_flow = Flow(start=validate_payment) 855 | 856 | # Inventory sub-flow 857 | check_stock >> reserve_items >> update_inventory 858 | inventory_flow = Flow(start=check_stock) 859 | 860 | # Shipping sub-flow 861 | create_label >> assign_carrier >> schedule_pickup 862 | shipping_flow = Flow(start=create_label) 863 | 864 | # Connect the flows into a main order pipeline 865 | payment_flow >> inventory_flow >> shipping_flow 866 | 867 | # Create the master flow 868 | order_pipeline = Flow(start=payment_flow) 869 | 870 | # Run the entire pipeline 871 | order_pipeline.run(shared_data) 872 | ``` 873 | 874 | This creates a clean separation of concerns while maintaining a clear execution path: 875 | 876 | ```mermaid 877 | flowchart LR 878 | subgraph order_pipeline[Order Pipeline] 879 | subgraph paymentFlow["Payment Flow"] 880 | A[Validate Payment] --> B[Process Payment] --> C[Payment Confirmation] 881 | end 882 | 883 | subgraph inventoryFlow["Inventory Flow"] 884 | D[Check Stock] --> E[Reserve Items] --> F[Update Inventory] 885 | end 886 | 887 | subgraph shippingFlow["Shipping Flow"] 888 | G[Create Label] --> H[Assign Carrier] --> I[Schedule Pickup] 889 | end 890 | 891 | paymentFlow --> inventoryFlow 892 | inventoryFlow --> shippingFlow 893 | end 894 | ``` 895 | 896 | ================================================ 897 | File: docs/core_abstraction/node.md 898 | ================================================ 899 | --- 900 | layout: default 901 | title: "Node" 902 | parent: "Core Abstraction" 903 | nav_order: 1 904 | --- 905 | 906 | # Node 907 | 908 | A **Node** is the smallest building block. Each Node has 3 steps `prep->exec->post`: 909 | 910 |
911 | 912 |
913 | 914 | 1. `prep(shared)` 915 | - **Read and preprocess data** from `shared` store. 916 | - Examples: *query DB, read files, or serialize data into a string*. 917 | - Return `prep_res`, which is used by `exec()` and `post()`. 918 | 919 | 2. `exec(prep_res)` 920 | - **Execute compute logic**, with optional retries and error handling (below). 921 | - Examples: *(mostly) LLM calls, remote APIs, tool use*. 922 | - ⚠️ This shall be only for compute and **NOT** access `shared`. 923 | - ⚠️ If retries enabled, ensure idempotent implementation. 924 | - ⚠️ Defer exception handling to the Node's built-in retry mechanism. 925 | - Return `exec_res`, which is passed to `post()`. 926 | 927 | 3. `post(shared, prep_res, exec_res)` 928 | - **Postprocess and write data** back to `shared`. 929 | - Examples: *update DB, change states, log results*. 930 | - **Decide the next action** by returning a *string* (`action = "default"` if *None*). 931 | 932 | > **Why 3 steps?** To enforce the principle of *separation of concerns*. The data storage and data processing are operated separately. 933 | > 934 | > All steps are *optional*. E.g., you can only implement `prep` and `post` if you just need to process data. 935 | {: .note } 936 | 937 | ### Fault Tolerance & Retries 938 | 939 | You can **retry** `exec()` if it raises an exception via two parameters when define the Node: 940 | 941 | - `max_retries` (int): Max times to run `exec()`. The default is `1` (**no** retry). 942 | - `wait` (int): The time to wait (in **seconds**) before next retry. By default, `wait=0` (no waiting). 943 | `wait` is helpful when you encounter rate-limits or quota errors from your LLM provider and need to back off. 944 | 945 | ```python 946 | my_node = SummarizeFile(max_retries=3, wait=10) 947 | ``` 948 | 949 | When an exception occurs in `exec()`, the Node automatically retries until: 950 | 951 | - It either succeeds, or 952 | - The Node has retried `max_retries - 1` times already and fails on the last attempt. 953 | 954 | You can get the current retry times (0-based) from `self.cur_retry`. 955 | 956 | ```python 957 | class RetryNode(Node): 958 | def exec(self, prep_res): 959 | print(f"Retry {self.cur_retry} times") 960 | raise Exception("Failed") 961 | ``` 962 | 963 | ### Graceful Fallback 964 | 965 | To **gracefully handle** the exception (after all retries) rather than raising it, override: 966 | 967 | ```python 968 | def exec_fallback(self, prep_res, exc): 969 | raise exc 970 | ``` 971 | 972 | By default, it just re-raises exception. But you can return a fallback result instead, which becomes the `exec_res` passed to `post()`. 973 | 974 | ### Example: Summarize file 975 | 976 | ```python 977 | class SummarizeFile(Node): 978 | def prep(self, shared): 979 | return shared["data"] 980 | 981 | def exec(self, prep_res): 982 | if not prep_res: 983 | return "Empty file content" 984 | prompt = f"Summarize this text in 10 words: {prep_res}" 985 | summary = call_llm(prompt) # might fail 986 | return summary 987 | 988 | def exec_fallback(self, prep_res, exc): 989 | # Provide a simple fallback instead of crashing 990 | return "There was an error processing your request." 991 | 992 | def post(self, shared, prep_res, exec_res): 993 | shared["summary"] = exec_res 994 | # Return "default" by not returning 995 | 996 | summarize_node = SummarizeFile(max_retries=3) 997 | 998 | # node.run() calls prep->exec->post 999 | # If exec() fails, it retries up to 3 times before calling exec_fallback() 1000 | action_result = summarize_node.run(shared) 1001 | 1002 | print("Action returned:", action_result) # "default" 1003 | print("Summary stored:", shared["summary"]) 1004 | ``` 1005 | 1006 | ================================================ 1007 | File: docs/core_abstraction/parallel.md 1008 | ================================================ 1009 | --- 1010 | layout: default 1011 | title: "(Advanced) Parallel" 1012 | parent: "Core Abstraction" 1013 | nav_order: 6 1014 | --- 1015 | 1016 | # (Advanced) Parallel 1017 | 1018 | **Parallel** Nodes and Flows let you run multiple **Async** Nodes and Flows **concurrently**—for example, summarizing multiple texts at once. This can improve performance by overlapping I/O and compute. 1019 | 1020 | > Because of Python’s GIL, parallel nodes and flows can’t truly parallelize CPU-bound tasks (e.g., heavy numerical computations). However, they excel at overlapping I/O-bound work—like LLM calls, database queries, API requests, or file I/O. 1021 | {: .warning } 1022 | 1023 | > - **Ensure Tasks Are Independent**: If each item depends on the output of a previous item, **do not** parallelize. 1024 | > 1025 | > - **Beware of Rate Limits**: Parallel calls can **quickly** trigger rate limits on LLM services. You may need a **throttling** mechanism (e.g., semaphores or sleep intervals). 1026 | > 1027 | > - **Consider Single-Node Batch APIs**: Some LLMs offer a **batch inference** API where you can send multiple prompts in a single call. This is more complex to implement but can be more efficient than launching many parallel requests and mitigates rate limits. 1028 | {: .best-practice } 1029 | 1030 | ## AsyncParallelBatchNode 1031 | 1032 | Like **AsyncBatchNode**, but run `exec_async()` in **parallel**: 1033 | 1034 | ```python 1035 | class ParallelSummaries(AsyncParallelBatchNode): 1036 | async def prep_async(self, shared): 1037 | # e.g., multiple texts 1038 | return shared["texts"] 1039 | 1040 | async def exec_async(self, text): 1041 | prompt = f"Summarize: {text}" 1042 | return await call_llm_async(prompt) 1043 | 1044 | async def post_async(self, shared, prep_res, exec_res_list): 1045 | shared["summary"] = "\n\n".join(exec_res_list) 1046 | return "default" 1047 | 1048 | node = ParallelSummaries() 1049 | flow = AsyncFlow(start=node) 1050 | ``` 1051 | 1052 | ## AsyncParallelBatchFlow 1053 | 1054 | Parallel version of **BatchFlow**. Each iteration of the sub-flow runs **concurrently** using different parameters: 1055 | 1056 | ```python 1057 | class SummarizeMultipleFiles(AsyncParallelBatchFlow): 1058 | async def prep_async(self, shared): 1059 | return [{"filename": f} for f in shared["files"]] 1060 | 1061 | sub_flow = AsyncFlow(start=LoadAndSummarizeFile()) 1062 | parallel_flow = SummarizeMultipleFiles(start=sub_flow) 1063 | await parallel_flow.run_async(shared) 1064 | ``` 1065 | 1066 | ================================================ 1067 | File: docs/design_pattern/agent.md 1068 | ================================================ 1069 | --- 1070 | layout: default 1071 | title: "Agent" 1072 | parent: "Design Pattern" 1073 | nav_order: 1 1074 | --- 1075 | 1076 | # Agent 1077 | 1078 | Agent is a powerful design pattern in which nodes can take dynamic actions based on the context. 1079 | 1080 |
1081 | 1082 |
1083 | 1084 | ## Implement Agent with Graph 1085 | 1086 | 1. **Context and Action:** Implement nodes that supply context and perform actions. 1087 | 2. **Branching:** Use branching to connect each action node to an agent node. Use action to allow the agent to direct the [flow](../core_abstraction/flow.md) between nodes—and potentially loop back for multi-step. 1088 | 3. **Agent Node:** Provide a prompt to decide action—for example: 1089 | 1090 | ```python 1091 | f""" 1092 | ### CONTEXT 1093 | Task: {task_description} 1094 | Previous Actions: {previous_actions} 1095 | Current State: {current_state} 1096 | 1097 | ### ACTION SPACE 1098 | [1] search 1099 | Description: Use web search to get results 1100 | Parameters: 1101 | - query (str): What to search for 1102 | 1103 | [2] answer 1104 | Description: Conclude based on the results 1105 | Parameters: 1106 | - result (str): Final answer to provide 1107 | 1108 | ### NEXT ACTION 1109 | Decide the next action based on the current context and available action space. 1110 | Return your response in the following format: 1111 | 1112 | ```yaml 1113 | thinking: | 1114 | 1115 | action: 1116 | parameters: 1117 | : 1118 | ```""" 1119 | ``` 1120 | 1121 | The core of building **high-performance** and **reliable** agents boils down to: 1122 | 1123 | 1. **Context Management:** Provide *relevant, minimal context.* For example, rather than including an entire chat history, retrieve the most relevant via [RAG](./rag.md). Even with larger context windows, LLMs still fall victim to ["lost in the middle"](https://arxiv.org/abs/2307.03172), overlooking mid-prompt content. 1124 | 1125 | 2. **Action Space:** Provide *a well-structured and unambiguous* set of actions—avoiding overlap like separate `read_databases` or `read_csvs`. Instead, import CSVs into the database. 1126 | 1127 | ## Example Good Action Design 1128 | 1129 | - **Incremental:** Feed content in manageable chunks (500 lines or 1 page) instead of all at once. 1130 | 1131 | - **Overview-zoom-in:** First provide high-level structure (table of contents, summary), then allow drilling into details (raw texts). 1132 | 1133 | - **Parameterized/Programmable:** Instead of fixed actions, enable parameterized (columns to select) or programmable (SQL queries) actions, for example, to read CSV files. 1134 | 1135 | - **Backtracking:** Let the agent undo the last step instead of restarting entirely, preserving progress when encountering errors or dead ends. 1136 | 1137 | ## Example: Search Agent 1138 | 1139 | This agent: 1140 | 1. Decides whether to search or answer 1141 | 2. If searches, loops back to decide if more search needed 1142 | 3. Answers when enough context gathered 1143 | 1144 | ```python 1145 | class DecideAction(Node): 1146 | def prep(self, shared): 1147 | context = shared.get("context", "No previous search") 1148 | query = shared["query"] 1149 | return query, context 1150 | 1151 | def exec(self, inputs): 1152 | query, context = inputs 1153 | prompt = f""" 1154 | Given input: {query} 1155 | Previous search results: {context} 1156 | Should I: 1) Search web for more info 2) Answer with current knowledge 1157 | Output in yaml: 1158 | ```yaml 1159 | action: search/answer 1160 | reason: why this action 1161 | search_term: search phrase if action is search 1162 | ```""" 1163 | resp = call_llm(prompt) 1164 | yaml_str = resp.split("```yaml")[1].split("```")[0].strip() 1165 | result = yaml.safe_load(yaml_str) 1166 | 1167 | assert isinstance(result, dict) 1168 | assert "action" in result 1169 | assert "reason" in result 1170 | assert result["action"] in ["search", "answer"] 1171 | if result["action"] == "search": 1172 | assert "search_term" in result 1173 | 1174 | return result 1175 | 1176 | def post(self, shared, prep_res, exec_res): 1177 | if exec_res["action"] == "search": 1178 | shared["search_term"] = exec_res["search_term"] 1179 | return exec_res["action"] 1180 | 1181 | class SearchWeb(Node): 1182 | def prep(self, shared): 1183 | return shared["search_term"] 1184 | 1185 | def exec(self, search_term): 1186 | return search_web(search_term) 1187 | 1188 | def post(self, shared, prep_res, exec_res): 1189 | prev_searches = shared.get("context", []) 1190 | shared["context"] = prev_searches + [ 1191 | {"term": shared["search_term"], "result": exec_res} 1192 | ] 1193 | return "decide" 1194 | 1195 | class DirectAnswer(Node): 1196 | def prep(self, shared): 1197 | return shared["query"], shared.get("context", "") 1198 | 1199 | def exec(self, inputs): 1200 | query, context = inputs 1201 | return call_llm(f"Context: {context}\nAnswer: {query}") 1202 | 1203 | def post(self, shared, prep_res, exec_res): 1204 | print(f"Answer: {exec_res}") 1205 | shared["answer"] = exec_res 1206 | 1207 | # Connect nodes 1208 | decide = DecideAction() 1209 | search = SearchWeb() 1210 | answer = DirectAnswer() 1211 | 1212 | decide - "search" >> search 1213 | decide - "answer" >> answer 1214 | search - "decide" >> decide # Loop back 1215 | 1216 | flow = Flow(start=decide) 1217 | flow.run({"query": "Who won the Nobel Prize in Physics 2024?"}) 1218 | ``` 1219 | 1220 | ================================================ 1221 | File: docs/design_pattern/mapreduce.md 1222 | ================================================ 1223 | --- 1224 | layout: default 1225 | title: "Map Reduce" 1226 | parent: "Design Pattern" 1227 | nav_order: 4 1228 | --- 1229 | 1230 | # Map Reduce 1231 | 1232 | MapReduce is a design pattern suitable when you have either: 1233 | - Large input data (e.g., multiple files to process), or 1234 | - Large output data (e.g., multiple forms to fill) 1235 | 1236 | and there is a logical way to break the task into smaller, ideally independent parts. 1237 | 1238 |
1239 | 1240 |
1241 | 1242 | You first break down the task using [BatchNode](../core_abstraction/batch.md) in the map phase, followed by aggregation in the reduce phase. 1243 | 1244 | ### Example: Document Summarization 1245 | 1246 | ```python 1247 | class SummarizeAllFiles(BatchNode): 1248 | def prep(self, shared): 1249 | files_dict = shared["files"] # e.g. 10 files 1250 | return list(files_dict.items()) # [("file1.txt", "aaa..."), ("file2.txt", "bbb..."), ...] 1251 | 1252 | def exec(self, one_file): 1253 | filename, file_content = one_file 1254 | summary_text = call_llm(f"Summarize the following file:\n{file_content}") 1255 | return (filename, summary_text) 1256 | 1257 | def post(self, shared, prep_res, exec_res_list): 1258 | shared["file_summaries"] = dict(exec_res_list) 1259 | 1260 | class CombineSummaries(Node): 1261 | def prep(self, shared): 1262 | return shared["file_summaries"] 1263 | 1264 | def exec(self, file_summaries): 1265 | # format as: "File1: summary\nFile2: summary...\n" 1266 | text_list = [] 1267 | for fname, summ in file_summaries.items(): 1268 | text_list.append(f"{fname} summary:\n{summ}\n") 1269 | big_text = "\n---\n".join(text_list) 1270 | 1271 | return call_llm(f"Combine these file summaries into one final summary:\n{big_text}") 1272 | 1273 | def post(self, shared, prep_res, final_summary): 1274 | shared["all_files_summary"] = final_summary 1275 | 1276 | batch_node = SummarizeAllFiles() 1277 | combine_node = CombineSummaries() 1278 | batch_node >> combine_node 1279 | 1280 | flow = Flow(start=batch_node) 1281 | 1282 | shared = { 1283 | "files": { 1284 | "file1.txt": "Alice was beginning to get very tired of sitting by her sister...", 1285 | "file2.txt": "Some other interesting text ...", 1286 | # ... 1287 | } 1288 | } 1289 | flow.run(shared) 1290 | print("Individual Summaries:", shared["file_summaries"]) 1291 | print("\nFinal Summary:\n", shared["all_files_summary"]) 1292 | ``` 1293 | 1294 | ================================================ 1295 | File: docs/design_pattern/rag.md 1296 | ================================================ 1297 | --- 1298 | layout: default 1299 | title: "RAG" 1300 | parent: "Design Pattern" 1301 | nav_order: 3 1302 | --- 1303 | 1304 | # RAG (Retrieval Augmented Generation) 1305 | 1306 | For certain LLM tasks like answering questions, providing relevant context is essential. One common architecture is a **two-stage** RAG pipeline: 1307 | 1308 |
1309 | 1310 |
1311 | 1312 | 1. **Offline stage**: Preprocess and index documents ("building the index"). 1313 | 2. **Online stage**: Given a question, generate answers by retrieving the most relevant context. 1314 | 1315 | --- 1316 | ## Stage 1: Offline Indexing 1317 | 1318 | We create three Nodes: 1319 | 1. `ChunkDocs` – [chunks](../utility_function/chunking.md) raw text. 1320 | 2. `EmbedDocs` – [embeds](../utility_function/embedding.md) each chunk. 1321 | 3. `StoreIndex` – stores embeddings into a [vector database](../utility_function/vector.md). 1322 | 1323 | ```python 1324 | class ChunkDocs(BatchNode): 1325 | def prep(self, shared): 1326 | # A list of file paths in shared["files"]. We process each file. 1327 | return shared["files"] 1328 | 1329 | def exec(self, filepath): 1330 | # read file content. In real usage, do error handling. 1331 | with open(filepath, "r", encoding="utf-8") as f: 1332 | text = f.read() 1333 | # chunk by 100 chars each 1334 | chunks = [] 1335 | size = 100 1336 | for i in range(0, len(text), size): 1337 | chunks.append(text[i : i + size]) 1338 | return chunks 1339 | 1340 | def post(self, shared, prep_res, exec_res_list): 1341 | # exec_res_list is a list of chunk-lists, one per file. 1342 | # flatten them all into a single list of chunks. 1343 | all_chunks = [] 1344 | for chunk_list in exec_res_list: 1345 | all_chunks.extend(chunk_list) 1346 | shared["all_chunks"] = all_chunks 1347 | 1348 | class EmbedDocs(BatchNode): 1349 | def prep(self, shared): 1350 | return shared["all_chunks"] 1351 | 1352 | def exec(self, chunk): 1353 | return get_embedding(chunk) 1354 | 1355 | def post(self, shared, prep_res, exec_res_list): 1356 | # Store the list of embeddings. 1357 | shared["all_embeds"] = exec_res_list 1358 | print(f"Total embeddings: {len(exec_res_list)}") 1359 | 1360 | class StoreIndex(Node): 1361 | def prep(self, shared): 1362 | # We'll read all embeds from shared. 1363 | return shared["all_embeds"] 1364 | 1365 | def exec(self, all_embeds): 1366 | # Create a vector index (faiss or other DB in real usage). 1367 | index = create_index(all_embeds) 1368 | return index 1369 | 1370 | def post(self, shared, prep_res, index): 1371 | shared["index"] = index 1372 | 1373 | # Wire them in sequence 1374 | chunk_node = ChunkDocs() 1375 | embed_node = EmbedDocs() 1376 | store_node = StoreIndex() 1377 | 1378 | chunk_node >> embed_node >> store_node 1379 | 1380 | OfflineFlow = Flow(start=chunk_node) 1381 | ``` 1382 | 1383 | Usage example: 1384 | 1385 | ```python 1386 | shared = { 1387 | "files": ["doc1.txt", "doc2.txt"], # any text files 1388 | } 1389 | OfflineFlow.run(shared) 1390 | ``` 1391 | 1392 | --- 1393 | ## Stage 2: Online Query & Answer 1394 | 1395 | We have 3 nodes: 1396 | 1. `EmbedQuery` – embeds the user’s question. 1397 | 2. `RetrieveDocs` – retrieves top chunk from the index. 1398 | 3. `GenerateAnswer` – calls the LLM with the question + chunk to produce the final answer. 1399 | 1400 | ```python 1401 | class EmbedQuery(Node): 1402 | def prep(self, shared): 1403 | return shared["question"] 1404 | 1405 | def exec(self, question): 1406 | return get_embedding(question) 1407 | 1408 | def post(self, shared, prep_res, q_emb): 1409 | shared["q_emb"] = q_emb 1410 | 1411 | class RetrieveDocs(Node): 1412 | def prep(self, shared): 1413 | # We'll need the query embedding, plus the offline index/chunks 1414 | return shared["q_emb"], shared["index"], shared["all_chunks"] 1415 | 1416 | def exec(self, inputs): 1417 | q_emb, index, chunks = inputs 1418 | I, D = search_index(index, q_emb, top_k=1) 1419 | best_id = I[0][0] 1420 | relevant_chunk = chunks[best_id] 1421 | return relevant_chunk 1422 | 1423 | def post(self, shared, prep_res, relevant_chunk): 1424 | shared["retrieved_chunk"] = relevant_chunk 1425 | print("Retrieved chunk:", relevant_chunk[:60], "...") 1426 | 1427 | class GenerateAnswer(Node): 1428 | def prep(self, shared): 1429 | return shared["question"], shared["retrieved_chunk"] 1430 | 1431 | def exec(self, inputs): 1432 | question, chunk = inputs 1433 | prompt = f"Question: {question}\nContext: {chunk}\nAnswer:" 1434 | return call_llm(prompt) 1435 | 1436 | def post(self, shared, prep_res, answer): 1437 | shared["answer"] = answer 1438 | print("Answer:", answer) 1439 | 1440 | embed_qnode = EmbedQuery() 1441 | retrieve_node = RetrieveDocs() 1442 | generate_node = GenerateAnswer() 1443 | 1444 | embed_qnode >> retrieve_node >> generate_node 1445 | OnlineFlow = Flow(start=embed_qnode) 1446 | ``` 1447 | 1448 | Usage example: 1449 | 1450 | ```python 1451 | # Suppose we already ran OfflineFlow and have: 1452 | # shared["all_chunks"], shared["index"], etc. 1453 | shared["question"] = "Why do people like cats?" 1454 | 1455 | OnlineFlow.run(shared) 1456 | # final answer in shared["answer"] 1457 | ``` 1458 | 1459 | ================================================ 1460 | File: docs/design_pattern/structure.md 1461 | ================================================ 1462 | --- 1463 | layout: default 1464 | title: "Structured Output" 1465 | parent: "Design Pattern" 1466 | nav_order: 5 1467 | --- 1468 | 1469 | # Structured Output 1470 | 1471 | In many use cases, you may want the LLM to output a specific structure, such as a list or a dictionary with predefined keys. 1472 | 1473 | There are several approaches to achieve a structured output: 1474 | - **Prompting** the LLM to strictly return a defined structure. 1475 | - Using LLMs that natively support **schema enforcement**. 1476 | - **Post-processing** the LLM's response to extract structured content. 1477 | 1478 | In practice, **Prompting** is simple and reliable for modern LLMs. 1479 | 1480 | ### Example Use Cases 1481 | 1482 | - Extracting Key Information 1483 | 1484 | ```yaml 1485 | product: 1486 | name: Widget Pro 1487 | price: 199.99 1488 | description: | 1489 | A high-quality widget designed for professionals. 1490 | Recommended for advanced users. 1491 | ``` 1492 | 1493 | - Summarizing Documents into Bullet Points 1494 | 1495 | ```yaml 1496 | summary: 1497 | - This product is easy to use. 1498 | - It is cost-effective. 1499 | - Suitable for all skill levels. 1500 | ``` 1501 | 1502 | - Generating Configuration Files 1503 | 1504 | ```yaml 1505 | server: 1506 | host: 127.0.0.1 1507 | port: 8080 1508 | ssl: true 1509 | ``` 1510 | 1511 | ## Prompt Engineering 1512 | 1513 | When prompting the LLM to produce **structured** output: 1514 | 1. **Wrap** the structure in code fences (e.g., `yaml`). 1515 | 2. **Validate** that all required fields exist (and let `Node` handles retry). 1516 | 1517 | ### Example Text Summarization 1518 | 1519 | ```python 1520 | class SummarizeNode(Node): 1521 | def exec(self, prep_res): 1522 | # Suppose `prep_res` is the text to summarize. 1523 | prompt = f""" 1524 | Please summarize the following text as YAML, with exactly 3 bullet points 1525 | 1526 | {prep_res} 1527 | 1528 | Now, output: 1529 | ```yaml 1530 | summary: 1531 | - bullet 1 1532 | - bullet 2 1533 | - bullet 3 1534 | ```""" 1535 | response = call_llm(prompt) 1536 | yaml_str = response.split("```yaml")[1].split("```")[0].strip() 1537 | 1538 | import yaml 1539 | structured_result = yaml.safe_load(yaml_str) 1540 | 1541 | assert "summary" in structured_result 1542 | assert isinstance(structured_result["summary"], list) 1543 | 1544 | return structured_result 1545 | ``` 1546 | 1547 | > Besides using `assert` statements, another popular way to validate schemas is [Pydantic](https://github.com/pydantic/pydantic) 1548 | {: .note } 1549 | 1550 | ### Why YAML instead of JSON? 1551 | 1552 | Current LLMs struggle with escaping. YAML is easier with strings since they don't always need quotes. 1553 | 1554 | **In JSON** 1555 | 1556 | ```json 1557 | { 1558 | "dialogue": "Alice said: \"Hello Bob.\\nHow are you?\\nI am good.\"" 1559 | } 1560 | ``` 1561 | 1562 | - Every double quote inside the string must be escaped with `\"`. 1563 | - Each newline in the dialogue must be represented as `\n`. 1564 | 1565 | **In YAML** 1566 | 1567 | ```yaml 1568 | dialogue: | 1569 | Alice said: "Hello Bob. 1570 | How are you? 1571 | I am good." 1572 | ``` 1573 | 1574 | - No need to escape interior quotes—just place the entire text under a block literal (`|`). 1575 | - Newlines are naturally preserved without needing `\n`. 1576 | 1577 | ================================================ 1578 | File: docs/design_pattern/workflow.md 1579 | ================================================ 1580 | --- 1581 | layout: default 1582 | title: "Workflow" 1583 | parent: "Design Pattern" 1584 | nav_order: 2 1585 | --- 1586 | 1587 | # Workflow 1588 | 1589 | Many real-world tasks are too complex for one LLM call. The solution is to **Task Decomposition**: decompose them into a [chain](../core_abstraction/flow.md) of multiple Nodes. 1590 | 1591 |
1592 | 1593 |
1594 | 1595 | > - You don't want to make each task **too coarse**, because it may be *too complex for one LLM call*. 1596 | > - You don't want to make each task **too granular**, because then *the LLM call doesn't have enough context* and results are *not consistent across nodes*. 1597 | > 1598 | > You usually need multiple *iterations* to find the *sweet spot*. If the task has too many *edge cases*, consider using [Agents](./agent.md). 1599 | {: .best-practice } 1600 | 1601 | ### Example: Article Writing 1602 | 1603 | ```python 1604 | class GenerateOutline(Node): 1605 | def prep(self, shared): return shared["topic"] 1606 | def exec(self, topic): return call_llm(f"Create a detailed outline for an article about {topic}") 1607 | def post(self, shared, prep_res, exec_res): shared["outline"] = exec_res 1608 | 1609 | class WriteSection(Node): 1610 | def prep(self, shared): return shared["outline"] 1611 | def exec(self, outline): return call_llm(f"Write content based on this outline: {outline}") 1612 | def post(self, shared, prep_res, exec_res): shared["draft"] = exec_res 1613 | 1614 | class ReviewAndRefine(Node): 1615 | def prep(self, shared): return shared["draft"] 1616 | def exec(self, draft): return call_llm(f"Review and improve this draft: {draft}") 1617 | def post(self, shared, prep_res, exec_res): shared["final_article"] = exec_res 1618 | 1619 | # Connect nodes 1620 | outline = GenerateOutline() 1621 | write = WriteSection() 1622 | review = ReviewAndRefine() 1623 | 1624 | outline >> write >> review 1625 | 1626 | # Create and run flow 1627 | writing_flow = Flow(start=outline) 1628 | shared = {"topic": "AI Safety"} 1629 | writing_flow.run(shared) 1630 | ``` 1631 | 1632 | For *dynamic cases*, consider using [Agents](./agent.md). 1633 | 1634 | ================================================ 1635 | File: docs/utility_function/llm.md 1636 | ================================================ 1637 | --- 1638 | layout: default 1639 | title: "LLM Wrapper" 1640 | parent: "Utility Function" 1641 | nav_order: 1 1642 | --- 1643 | 1644 | # LLM Wrappers 1645 | 1646 | Check out libraries like [litellm](https://github.com/BerriAI/litellm). 1647 | Here, we provide some minimal example implementations: 1648 | 1649 | 1. OpenAI 1650 | ```python 1651 | def call_llm(prompt): 1652 | from openai import OpenAI 1653 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 1654 | r = client.chat.completions.create( 1655 | model="gpt-4o", 1656 | messages=[{"role": "user", "content": prompt}] 1657 | ) 1658 | return r.choices[0].message.content 1659 | 1660 | # Example usage 1661 | call_llm("How are you?") 1662 | ``` 1663 | > Store the API key in an environment variable like OPENAI_API_KEY for security. 1664 | {: .best-practice } 1665 | 1666 | 2. Claude (Anthropic) 1667 | ```python 1668 | def call_llm(prompt): 1669 | from anthropic import Anthropic 1670 | client = Anthropic(api_key="YOUR_API_KEY_HERE") 1671 | r = client.messages.create( 1672 | model="claude-sonnet-4-0", 1673 | messages=[ 1674 | {"role": "user", "content": prompt} 1675 | ] 1676 | ) 1677 | return r.content[0].text 1678 | ``` 1679 | 1680 | 3. Google (Generative AI Studio / PaLM API) 1681 | ```python 1682 | def call_llm(prompt): 1683 | from google import genai 1684 | client = genai.Client(api_key='GEMINI_API_KEY') 1685 | response = client.models.generate_content( 1686 | model='gemini-2.5-pro', 1687 | contents=prompt 1688 | ) 1689 | return response.text 1690 | ``` 1691 | 1692 | 4. Azure (Azure OpenAI) 1693 | ```python 1694 | def call_llm(prompt): 1695 | from openai import AzureOpenAI 1696 | client = AzureOpenAI( 1697 | azure_endpoint="https://.openai.azure.com/", 1698 | api_key="YOUR_API_KEY_HERE", 1699 | api_version="2023-05-15" 1700 | ) 1701 | r = client.chat.completions.create( 1702 | model="", 1703 | messages=[{"role": "user", "content": prompt}] 1704 | ) 1705 | return r.choices[0].message.content 1706 | ``` 1707 | 1708 | 5. Ollama (Local LLM) 1709 | ```python 1710 | def call_llm(prompt): 1711 | from ollama import chat 1712 | response = chat( 1713 | model="llama2", 1714 | messages=[{"role": "user", "content": prompt}] 1715 | ) 1716 | return response.message.content 1717 | ``` 1718 | 1719 | ## Improvements 1720 | Feel free to enhance your `call_llm` function as needed. Here are examples: 1721 | 1722 | - Handle chat history: 1723 | 1724 | ```python 1725 | def call_llm(messages): 1726 | from openai import OpenAI 1727 | client = OpenAI(api_key="YOUR_API_KEY_HERE") 1728 | r = client.chat.completions.create( 1729 | model="gpt-4o", 1730 | messages=messages 1731 | ) 1732 | return r.choices[0].message.content 1733 | ``` 1734 | 1735 | - Add in-memory caching 1736 | 1737 | ```python 1738 | from functools import lru_cache 1739 | 1740 | @lru_cache(maxsize=1000) 1741 | def call_llm(prompt): 1742 | # Your implementation here 1743 | pass 1744 | ``` 1745 | 1746 | > ⚠️ Caching conflicts with Node retries, as retries yield the same result. 1747 | > 1748 | > To address this, you could use cached results only if not retried. 1749 | {: .warning } 1750 | 1751 | 1752 | ```python 1753 | from functools import lru_cache 1754 | 1755 | @lru_cache(maxsize=1000) 1756 | def cached_call(prompt): 1757 | pass 1758 | 1759 | def call_llm(prompt, use_cache): 1760 | if use_cache: 1761 | return cached_call(prompt) 1762 | # Call the underlying function directly 1763 | return cached_call.__wrapped__(prompt) 1764 | 1765 | class SummarizeNode(Node): 1766 | def exec(self, text): 1767 | return call_llm(f"Summarize: {text}", self.cur_retry==0) 1768 | ``` 1769 | 1770 | - Enable logging: 1771 | 1772 | ```python 1773 | def call_llm(prompt): 1774 | import logging 1775 | logging.info(f"Prompt: {prompt}") 1776 | response = ... # Your implementation here 1777 | logging.info(f"Response: {response}") 1778 | return response 1779 | ``` --------------------------------------------------------------------------------