├── dspy-gepa-sql-generator ├── pyproject.toml ├── README.md └── nl2sql_gepa.py ├── dspy-fact-checker ├── pyproject.toml ├── LICENSE ├── README.md └── fact_check_rag.py ├── dspy-gepa-deidentification ├── pyproject.toml ├── LICENSE ├── README.md └── minimal_gepa_deid.py ├── dspy-gepa-researcher ├── pyproject.toml ├── LICENSE ├── README.md ├── report.md └── dspy_gepa_researcher.py ├── .gitignore └── README.md /dspy-gepa-sql-generator/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-sql-generator" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | ] 10 | -------------------------------------------------------------------------------- /dspy-fact-checker/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-fact-checker" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | "ipykernel>=7.0.1", 10 | "wikipedia>=1.4.0", 11 | ] 12 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-deidentification" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | "gepa>=0.0.7", 10 | "ipykernel>=7.0.1", 11 | ] 12 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-researcher" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy>=3.0.3", 9 | "exa-py>=1.16.1", 10 | "gepa>=0.0.7", 11 | "ipykernel>=7.0.1", 12 | "langgraph>=1.0.1", 13 | "litellm>=1.78.7", 14 | "pydantic>=2.12.3", 15 | "python-dateutil>=2.9.0.post0", 16 | ] 17 | -------------------------------------------------------------------------------- /dspy-fact-checker/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv/ 11 | venv/ 12 | env/ 13 | ENV/ 14 | .virtualenv 15 | 16 | # Environment variables and secrets 17 | .env 18 | .env.* 19 | .env.local 20 | .env.*.local 21 | *.key 22 | *.pem 23 | secrets.yaml 24 | secrets.json 25 | credentials.json 26 | .secrets/ 27 | config.local.* 28 | 29 | # IDE and editor files 30 | .vscode/ 31 | .idea/ 32 | *.swp 33 | *.swo 34 | *~ 35 | .DS_Store 36 | *.sublime-project 37 | *.sublime-workspace 38 | 39 | # OS-specific files 40 | .DS_Store 41 | Thumbs.db 42 | Desktop.ini 43 | 44 | # Python testing and coverage 45 | .pytest_cache/ 46 | .coverage 47 | .coverage.* 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Type checking 56 | .mypy_cache/ 57 | .dmypy.json 58 | dmypy.json 59 | .pyre/ 60 | .pytype/ 61 | 62 | # Jupyter Notebook 63 | .ipynb_checkpoints 64 | *.ipynb_checkpoints/ 65 | 66 | # Python distribution 67 | *.egg 68 | *.egg-info/ 69 | dist/ 70 | build/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | 82 | # Logs and databases 83 | *.log 84 | *.sql 85 | *.sqlite 86 | *.db 87 | 88 | # uv specific 89 | .python-version 90 | uv.lock 91 | 92 | # DSPy specific 93 | dspy_cache/ 94 | .dspy_cache/ 95 | compiled_programs/ 96 | *.dspy 97 | dspy_*.json 98 | 99 | # ML/AI artifacts 100 | models/ 101 | checkpoints/ 102 | *.pkl 103 | *.pickle 104 | *.joblib 105 | *.h5 106 | *.ckpt 107 | *.safetensors 108 | wandb/ 109 | mlruns/ 110 | experiments/ 111 | outputs/ 112 | 113 | # Data files - IMPORTANT: exclude all data to prevent PII leaks 114 | data/ 115 | *.csv 116 | *.tsv 117 | *.xlsx 118 | *.xls 119 | *.json 120 | *.jsonl 121 | *.parquet 122 | *.arrow 123 | *.feather 124 | raw_data/ 125 | processed_data/ 126 | test_data/ 127 | sample_data/ 128 | 129 | # Exceptions for important config files (override *.json above) 130 | !package.json 131 | !tsconfig.json 132 | !pyproject.toml 133 | 134 | # Exclude sample outputs that might contain generated text 135 | output/ 136 | results/ 137 | reports/*.txt 138 | reports/*.md 139 | 140 | # Temporary and scratch files 141 | scratch/ 142 | tmp/ 143 | temp/ 144 | *.tmp 145 | notes.txt 146 | todo.txt 147 | 148 | # Backup files 149 | *.bak 150 | *.backup 151 | *~.nib 152 | *.orig 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DSPy Examples with GEPA 2 | 3 | A collection of practical examples demonstrating how to use [DSPy](https://dspy.ai/). 4 | 5 | ## About 6 | 7 | This repository contains various examples showcasing different applications of DSPy (+ GEPA optimizer). 8 | 9 | ### What is DSPy? 10 | 11 | DSPy is a framework for algorithmically optimizing language model prompts and weights. Instead of manually tweaking prompts, you define what your system should do (the signature), and DSPy figures out how to do it through optimization. 12 | 13 | ### What is GEPA? 
14 | 15 | GEPA (Generalized Evolution of Prompting via Adaptation) is a DSPy optimizer that: 16 | - Automatically improves prompts through iterative reflection 17 | - Learns from feedback metrics without requiring labeled data 18 | - Evolves instructions based on performance analysis 19 | - Enables zero-shot learning with just input/output pairs 20 | 21 | ## Examples 22 | 23 | ### 1. [PII De-identification](./dspy-gepa-deidentification/) 24 | 25 | Demonstrates using GEPA to automatically optimize prompts for redacting personally identifiable information (PII) from incident reports. 26 | 27 | **Key concepts:** 28 | - Automatic prompt optimization for sensitive data handling 29 | - Dual metric systems (simple and composite) 30 | - Structured output preservation 31 | - Feedback-driven learning 32 | 33 | [View example →](./dspy-gepa-deidentification/) 34 | 35 | ### 2. [Fact-Checked RAG](./dspy-fact-checker/) 36 | 37 | A self-correcting Retrieval-Augmented Generation system that fact-checks its own answers against Wikipedia sources and automatically refines responses until they are fully supported by evidence. 38 | 39 | **Key concepts:** 40 | - Self-correcting pipeline with dspy.Refine 41 | - Fact verification against retrieved context 42 | - Wikipedia integration for knowledge retrieval 43 | - Automatic retry and refinement 44 | 45 | [View example →](./dspy-fact-checker/) 46 | 47 | ### 3. [Natural Language to SQL](./dspy-gepa-sql-generator/) 48 | 49 | Demonstrates using GEPA to optimize prompts for converting natural language questions into SQL queries, with comprehensive safety and correctness validation. 50 | 51 | **Key concepts:** 52 | - Natural language to SQL generation 53 | - Custom metrics for safety, execution, and correctness 54 | - Database schema understanding 55 | - Query optimization through GEPA 56 | 57 | [View example →](./dspy-gepa-sql-generator/) 58 | 59 | ## Getting Started 60 | 61 | Each example directory contains its own README with: 62 | - Detailed setup instructions 63 | - Usage examples 64 | - Configuration details 65 | - Code explanations 66 | 67 | Navigate to any example directory to get started. 68 | 69 | ## Requirements 70 | 71 | Most examples require: 72 | - Python 3.13+ 73 | - An OpenAI API key (or compatible LLM provider) 74 | - Dependencies managed via [uv](https://github.com/astral-sh/uv) or pip 75 | 76 | ## Contributing 77 | 78 | Additional examples are welcome! If you have a useful DSPy (+ GEPA) example to share: 79 | 1. Create a new directory with a descriptive name 80 | 2. Include a comprehensive README.md 81 | 3. Ensure dependencies are clearly documented 82 | 4. Add your example to this README's Examples section 83 | 5. Submit a pull request 84 | 85 | ## Resources 86 | 87 | - [DSPy Documentation](https://dspy.ai/) 88 | - [GEPA Optimizer Documentation](https://dspy.ai/api/optimizers/GEPA/overview/) 89 | - [DSPy GitHub Repository](https://github.com/stanfordnlp/dspy) 90 | 91 | ## License 92 | 93 | Individual examples may have their own licenses. Please check each example directory for details. 94 | 95 | --- 96 | 97 | **Note**: These are demonstration projects for educational purposes. Always validate and thoroughly test before using in production environments. 
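For a concrete feel of the pattern the examples share, here is a minimal, illustrative DSPy + GEPA sketch. The signature, toy metric, and model names below are placeholders for illustration, not code taken from any one example:

```python
import dspy

# Task model does the work; a (typically stronger) model reflects on prompts.
task_lm = dspy.LM("openai/gpt-4o-mini")
reflect_lm = dspy.LM("openai/gpt-4o")
dspy.configure(lm=task_lm)

class Summarize(dspy.Signature):
    """Summarize a passage in one sentence."""
    passage = dspy.InputField()
    summary = dspy.OutputField(desc="One-sentence summary.")

program = dspy.ChainOfThought(Summarize)

trainset = [
    dspy.Example(passage="DSPy lets you declare what a module should do via signatures.").with_inputs("passage"),
    dspy.Example(passage="GEPA evolves instructions by reflecting on metric feedback.").with_inputs("passage"),
]

def brevity_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    # Toy metric: reward short summaries and tell GEPA why the score was given.
    words = len((pred.summary or "").split())
    score = 1.0 if 0 < words <= 25 else 0.0
    feedback = "Good length." if score else "Return exactly one sentence of at most 25 words."
    return dspy.Prediction(score=score, feedback=feedback)

gepa = dspy.GEPA(metric=brevity_metric, auto="light", reflection_lm=reflect_lm)
optimized = gepa.compile(program, trainset=trainset, valset=trainset)
print(optimized(passage="GEPA rewrites module instructions based on feedback.").summary)
```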
98 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/README.md: -------------------------------------------------------------------------------- 1 | # DSPy GEPA for PII De-identification 2 | 3 | A minimal example demonstrating how to use DSPy's GEPA (Generalized Evolution of Prompting via Adaptation) optimizer to automatically improve PII (Personally Identifiable Information) redaction in incident reports. 4 | 5 | ## Overview 6 | 7 | This project showcases how GEPA can optimize prompts for sensitive data de-identification tasks through reflection-based prompt evolution. The system learns to: 8 | - Redact emails, phone numbers, and names using standard placeholders 9 | - Preserve document structure (headers, bullet points) 10 | - Maintain causal relationships and action items 11 | - Avoid fabricating new information 12 | 13 | ## Features 14 | 15 | - **Automatic Prompt Optimization**: GEPA evolves instructions based on feedback metrics 16 | - **Dual Metric System**: Includes both a simple and composite metric for evaluation 17 | - **Structured Output**: Maintains "Root cause:" and "Action items:" sections with bullets 18 | - **Zero-Shot Learning**: No labeled examples required - just input/output pairs 19 | - **Feedback-Driven**: Rich textual feedback guides the optimization process 20 | 21 | ## Installation 22 | 23 | This project uses [uv](https://github.com/astral-sh/uv) for dependency management. 24 | 25 | ```bash 26 | # Install dependencies with uv 27 | uv sync 28 | 29 | # Or with pip 30 | pip install dspy-ai gepa ipykernel 31 | ``` 32 | 33 | ### Requirements 34 | 35 | - Python 3.13+ 36 | - OpenAI API key (for GPT-4o and GPT-4o-mini) 37 | 38 | ## Configuration 39 | 40 | Create a `.env` file in the project root with your OpenAI API key: 41 | 42 | ```bash 43 | OPENAI_API_KEY=your-api-key-here 44 | ``` 45 | 46 | **Important**: Never commit your `.env` file or API keys to version control. 47 | 48 | ## Usage 49 | 50 | Run the minimal example: 51 | 52 | ```python 53 | uv run minimal_gepa_deid.py 54 | ``` 55 | 56 | The script will: 57 | 1. Define a de-identification signature and module 58 | 2. Configure GEPA with a reflection model 59 | 3. Optimize the module on training examples 60 | 4. Test on a sample incident report 61 | 62 | ### Example Output 63 | 64 | **Input:** 65 | ``` 66 | Root cause: Dave Miller called 650-555-0000 to report breach. 67 | Action items: 68 | - email dave@contoso.com 69 | - notify legal 70 | ``` 71 | 72 | **Output:** 73 | ``` 74 | Root cause: [NAME] called [PHONE] to report breach. 75 | Action items: 76 | - email [EMAIL] 77 | - notify legal 78 | ``` 79 | 80 | ## How It Works 81 | 82 | 1. **Signature Definition**: Specifies what the module should do (not how) 83 | 2. **Module Creation**: Uses `ChainOfThought` for reasoning about redactions 84 | 3. **Metric with Feedback**: Returns both a score and textual guidance 85 | 4. **GEPA Optimization**: Evolves internal instructions through reflection 86 | 5. 
**Inference**: Apply the optimized module to new reports 87 | 88 | ### Metrics 89 | 90 | **Simple Metric (`pii_metric`)**: 91 | - 60% score for zero PII leaks 92 | - 20% for preserving "Root cause:" header 93 | - 20% for preserving "Action items:" header 94 | 95 | **Composite Metric (`composite_pii_metric`)**: 96 | - Stricter checks including bullet point formatting 97 | - Hallucination detection (no new PII introduction) 98 | - Penalty-based scoring (1.0 - 0.25 × issues) 99 | 100 | ## Important Notes 101 | 102 | ### Data Privacy 103 | - This is a **demonstration project** - do not use with real sensitive data without thorough testing 104 | - Never commit actual incident reports or PII to version control 105 | - All data files (`.csv`, `.json`, etc.) are git-ignored by default 106 | 107 | ### Model Selection 108 | The example uses: 109 | - **Task LM**: `gpt-4o-mini` (faster, cheaper for execution) 110 | - **Reflection LM**: `gpt-4o` (stronger for meta-reasoning about prompts) 111 | 112 | You can adjust these in the code based on your needs and budget. 113 | 114 | ## Development 115 | 116 | ### Running in Jupyter 117 | The script can also be run in Jupyter notebooks. The project includes `ipykernel` for this purpose. 118 | 119 | ### Customization 120 | - Modify regex patterns in `EMAIL`, `PHONE`, `NAME` for your use case 121 | - Adjust scoring weights in the metric functions 122 | - Switch between `pii_metric` and `composite_pii_metric` in the GEPA configuration 123 | - Tune GEPA parameters (currently using `auto="light"` for quick demos) 124 | 125 | ## GEPA Configuration 126 | 127 | ```python 128 | gepa = dspy.GEPA( 129 | metric=pii_metric, 130 | auto="light", # or "medium"/"heavy" for more optimization 131 | reflection_lm=reflect_lm, # Stronger model for reflection 132 | track_stats=True, # Track optimization statistics 133 | track_best_outputs=True # Keep best candidates per input 134 | ) 135 | ``` 136 | 137 | ## References 138 | 139 | - [DSPy Documentation](https://dspy.ai/) 140 | - [GEPA Overview](https://dspy.ai/api/optimizers/GEPA/overview/) 141 | - [DSPy GitHub Repository](https://github.com/stanfordnlp/dspy) 142 | 143 | ## Contributing 144 | 145 | Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. 146 | 147 | ## License 148 | 149 | MIT License - see [LICENSE](LICENSE) for details. 150 | 151 | ## Acknowledgments 152 | 153 | Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP and [GEPA](https://dspy.ai/api/optimizers/GEPA/) optimizer. 154 | 155 | --- 156 | 157 | **Disclaimer**: This is a demonstration project for educational purposes. Always validate and test thoroughly before using in production environments with sensitive data. 158 | 159 | ## Citation 160 | 161 | If you use this project in your research, please cite: 162 | 163 | ```bibtex 164 | @software{dspy_examples, 165 | title={DSPy GEPA for PII De-identification}, 166 | author={Your Name}, 167 | year={2025}, 168 | url={https://github.com/raja-patnaik/dspy-examples} 169 | } 170 | ``` -------------------------------------------------------------------------------- /dspy-fact-checker/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Fact-Checked RAG 2 | 3 | A self-correcting Retrieval-Augmented Generation (RAG) system built with [DSPy](https://github.com/stanfordnlp/dspy) that fact-checks its own answers against Wikipedia sources. 
The system automatically refines responses until they are fully supported by retrieved evidence. 4 | 5 | ## Features 6 | 7 | - **Self-Correcting Pipeline**: Uses `dspy.Refine` to iteratively improve answers based on verification feedback 8 | - **Fact Verification**: Built-in verifier that checks if answer claims are supported by retrieved context 9 | - **Wikipedia Integration**: Custom retriever that fetches relevant passages from Wikipedia 10 | - **Automatic Retry**: Continues refining answers until all claims are verifiable (up to max attempts) 11 | - **Source Attribution**: Embeds Wikipedia URLs in context for transparency 12 | 13 | ## How It Works 14 | 15 | 1. **Retrieve**: Fetches relevant Wikipedia passages based on the question 16 | 2. **Generate**: Creates an answer using only information from the retrieved context 17 | 3. **Verify**: Checks if the answer contains any unsupported claims 18 | 4. **Refine**: If verification fails, automatically regenerates with feedback until the answer is fully supported 19 | 20 | The pipeline uses DSPy's `Refine` module with a reward function that scores 1.0 only when the verifier confirms all claims are supported by the context. 21 | 22 | ## Requirements 23 | 24 | - Python 3.13+ 25 | - OpenAI API key (uses `gpt-4o-mini` by default) 26 | 27 | ## Installation 28 | 29 | 1. Change to the project folder after cloning the repo: 30 | ```bash 31 | cd dspy-fact-checker 32 | ``` 33 | 34 | 2. Install dependencies using [uv](https://github.com/astral-sh/uv): 35 | ```bash 36 | uv sync 37 | ``` 38 | 39 | Or with pip: 40 | ```bash 41 | pip install dspy-ai wikipedia 42 | ``` 43 | 44 | 3. Set your OpenAI API key: 45 | ```bash 46 | export OPENAI_API_KEY="your-api-key-here" # Linux/Mac 47 | # OR 48 | set OPENAI_API_KEY=your-api-key-here # Windows 49 | ``` 50 | 51 | ## Usage 52 | 53 | Run the example: 54 | ```bash 55 | uv run fact_check_rag.py 56 | ``` 57 | 58 | ### Example Output 59 | 60 | ``` 61 | Question: When did Apollo 11 land on the Moon, and who were the astronauts involved? 62 | -------------------------------------------------------------------------------- 63 | Final Answer: 64 | Apollo 11 landed on the Moon on July 20, 1969. The astronauts involved were 65 | Neil Armstrong, Buzz Aldrin, and Michael Collins, with Armstrong and Aldrin 66 | walking on the lunar surface while Collins remained in orbit. 67 | -------------------------------------------------------------------------------- 68 | Unsupported Claims: None 69 | -------------------------------------------------------------------------------- 70 | Context used: 71 | [1] Apollo 11: Apollo 11 was the American spaceflight that first landed humans... 72 | [2] Neil Armstrong: Neil Alden Armstrong was an American astronaut... 73 | ``` 74 | 75 | ### Customizing Questions 76 | 77 | Edit the `question` variable in `fact_check_rag.py`: 78 | 79 | ```python 80 | question = "Who discovered penicillin and in which year was it first reported?" 81 | # or 82 | question = "When was the first FIFA World Cup held, and where?" 
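# Any factual question with Wikipedia coverage works; if the verifier flags
# unsupported claims, the answer is regenerated (up to max_attempts times).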
83 | ``` 84 | 85 | ### Configuration Options 86 | 87 | Adjust the RAG parameters: 88 | 89 | ```python 90 | program = FactCheckedRAG( 91 | k_passages=4, # Number of Wikipedia passages to retrieve 92 | max_attempts=3 # Maximum refinement iterations 93 | ) 94 | ``` 95 | 96 | Customize the Wikipedia retriever: 97 | 98 | ```python 99 | wiki_rm = WikipediaRetriever( 100 | max_chars_per_passage=1500, # Characters per passage 101 | language="en" # Wikipedia language code 102 | ) 103 | ``` 104 | 105 | ## Architecture 106 | 107 | ### Components 108 | 109 | - **WikipediaRetriever**: Custom retriever that searches Wikipedia and formats results for DSPy 110 | - **GenerateAnswer**: Signature for creating answers strictly from provided context 111 | - **VerifyAnswer**: Signature for identifying unsupported claims in answers 112 | - **FactCheckedRAG**: Main module combining retrieval, generation, and verification with refinement 113 | 114 | ### Key Technologies 115 | 116 | - [DSPy](https://github.com/stanfordnlp/dspy): Framework for programming language models 117 | - [Wikipedia API](https://pypi.org/project/wikipedia/): Python library for Wikipedia data 118 | - OpenAI GPT-4o-mini: Language model for generation and verification 119 | 120 | ## Advanced Usage 121 | 122 | ### Using Different LLMs 123 | 124 | Replace the OpenAI configuration with other supported DSPy models: 125 | 126 | ```python 127 | # Anthropic Claude 128 | lm = dspy.LM("anthropic/claude-3-5-sonnet-20241022", api_key=os.environ.get("ANTHROPIC_API_KEY")) 129 | 130 | # Local models 131 | lm = dspy.LM("ollama/llama2") 132 | ``` 133 | 134 | ### Custom Retrievers 135 | 136 | Extend the retriever for other knowledge sources: 137 | 138 | ```python 139 | class CustomRetriever: 140 | def __call__(self, query: str, k: int = 8): 141 | # Implement your retrieval logic 142 | # Must return list[dotdict] with `.long_text` attribute 143 | pass 144 | ``` 145 | 146 | ## Limitations 147 | 148 | - Relies on Wikipedia data quality and coverage 149 | - Answer quality depends on the underlying LLM's capabilities 150 | - May require multiple refinement attempts for complex questions 151 | - English Wikipedia by default (configurable) 152 | 153 | ## Contributing 154 | 155 | Contributions are welcome! Please feel free to submit issues or pull requests. 156 | 157 | ## License 158 | 159 | MIT License - see [LICENSE](LICENSE) for details. 
160 | 161 | ## Acknowledgments 162 | 163 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 164 | - Uses the [Wikipedia API](https://pypi.org/project/wikipedia/) for knowledge retrieval 165 | 166 | ## Citation 167 | 168 | If you use this project in your research, please cite: 169 | 170 | ```bibtex 171 | @software{dspy_examples, 172 | title={DSPy Multi-Agent Research Pipeline}, 173 | author={Your Name}, 174 | year={2025}, 175 | url={https://github.com/raja-patnaik/dspy-examples} 176 | } 177 | ``` -------------------------------------------------------------------------------- /dspy-gepa-deidentification/minimal_gepa_deid.py: -------------------------------------------------------------------------------- 1 | import re 2 | import dspy 3 | 4 | 5 | # 0) Pick task + reflection models (reflection ≈ stronger) 6 | task_lm = dspy.LM("openai/gpt-4o-mini") 7 | reflect_lm = dspy.LM("openai/gpt-4o") 8 | dspy.configure(lm=task_lm) # global default LM for modules :contentReference[oaicite:8]{index=8} 9 | 10 | # 1) Signature: what the module does (not how to prompt) 11 | class DeIDSignature(dspy.Signature): 12 | """Rewrite an incident report to remove PII while preserving causal structure and action items.""" 13 | report = dspy.InputField(desc="Raw incident report text.") 14 | rules = dspy.InputField(desc="Redaction rules and required output format.") 15 | clean_report = dspy.OutputField( 16 | desc="Redacted report using [EMAIL], [PHONE], [NAME]. Keep 'Root cause:' + 'Action items:' and bullets." 17 | ) 18 | 19 | # 2) Module: we’ll let GEPA evolve its internal instructions 20 | class DeIDProgram(dspy.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.rewriter = dspy.ChainOfThought(DeIDSignature) # adds .reasoning field to the prediction :contentReference[oaicite:9]{index=9} 24 | def forward(self, report, rules): 25 | return self.rewriter(report=report, rules=rules) 26 | 27 | student = DeIDProgram() 28 | 29 | # 3) Tiny “dataset”: GEPA doesn’t require labels, just examples to evaluate on 30 | RULES = """Redact emails, phone numbers, and full names. Use placeholders [EMAIL], [PHONE], [NAME]. 31 | Keep section headers and bullets. Output format: 32 | Root cause: ... 33 | Action items: ... 34 | - bullets for action items""" 35 | 36 | trainset = [ 37 | dspy.Example( 38 | report="Root cause: Alice Chen emailed ops (alice.chen@acme.io).\nAction items:\n- Call +1 (415) 555-0199 to notify vendor.", 39 | rules=RULES 40 | ).with_inputs("report", "rules"), 41 | dspy.Example( 42 | report="Root cause: Misconfigured S3 bucket by Bob A.\nAction items:\n- Rotate keys\n- email secops@company.com with incident ID 12345", 43 | rules=RULES 44 | ).with_inputs("report", "rules"), 45 | ] 46 | 47 | devset = [ 48 | dspy.Example( 49 | report="Root cause: OT sensor alert phoned to 212-555-0101 by Carol Q.\nAction items:\n- File ticket\n- email ops@example.org", 50 | rules=RULES 51 | ).with_inputs("report", "rules"), 52 | ] 53 | # Note: .with_inputs tells DSPy which fields are inputs for evaluation/compilation. :contentReference[oaicite:10]{index=10} 54 | 55 | # 4) Metric with feedback: score + *text* guidance for GEPA 56 | EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}") 57 | PHONE = re.compile(r"(?:\\+?\\d{1,3}[-. (]*)?\\d{3}[-. )]*\\d{3}[-. 
]*\\d{4}") 58 | NAME = re.compile(r"\\b([A-Z][a-z]+ [A-Z][a-z]+)\\b") 59 | 60 | def pii_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): 61 | text = (pred.clean_report or "").strip() 62 | leaks = [] 63 | if EMAIL.search(text): 64 | leaks.append("email") 65 | if PHONE.search(text): 66 | leaks.append("phone") 67 | if NAME.search(gold.report) and "[NAME]" not in text: 68 | leaks.append("name") 69 | 70 | keeps_root = "Root cause:" in text 71 | keeps_actions = "Action items:" in text 72 | 73 | # Score ∈ [0,1]: 0.6 for zero leaks + 0.2 each for keeping the two sections 74 | score = (0.6 if not leaks else 0.0) + (0.2 if keeps_root else 0.0) + (0.2 if keeps_actions else 0.0) 75 | 76 | feedback = [] 77 | if leaks: 78 | feedback.append(f"PII leaked: {', '.join(leaks)}. Replace PII with [EMAIL], [PHONE], [NAME].") 79 | if not keeps_root or not keeps_actions: 80 | missing = [] 81 | if not keeps_root: 82 | missing.append("keep 'Root cause:'") 83 | if not keeps_actions: 84 | missing.append("keep 'Action items:'") 85 | feedback.append("Also " + " and ".join(missing) + ".") 86 | if not feedback: 87 | feedback.append("Great: no PII and structure preserved. Prefer succinct edits; avoid adding facts.") 88 | 89 | return dspy.Prediction(score=score, feedback=" ".join(feedback)) # GEPA reads this feedback to evolve instructions. 90 | 91 | 92 | # Slightly stricter composite metric 93 | def composite_pii_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): 94 | text = (pred.clean_report or "").strip() 95 | issues = [] 96 | 97 | # 1) PII leak checks (extend with better detectors as needed) 98 | leaks = [] 99 | if EMAIL.search(text): 100 | leaks.append("email") 101 | if PHONE.search(text): 102 | leaks.append("phone") 103 | if NAME.search(gold.report) and "[NAME]" not in text: 104 | leaks.append("name") 105 | if leaks: 106 | issues.append(f"PII leaked: {', '.join(leaks)}; replace with placeholders.") 107 | 108 | # 2) Structure invariants 109 | if "Root cause:" not in text: 110 | issues.append("Missing header: 'Root cause:'.") 111 | if "Action items:" not in text: 112 | issues.append("Missing header: 'Action items:'.") 113 | 114 | # 3) Formatting: require bullets for action items 115 | if "Action items:" in text: 116 | after = text.split("Action items:", 1)[1] 117 | if "-" not in after and "\n•" not in after: 118 | issues.append("Action items must be bulleted with '-' or '•'.") 119 | 120 | # 4) No fabrication: forbid adding new emails/phones beyond placeholders 121 | hallucination = EMAIL.findall(text) or PHONE.findall(text) 122 | if hallucination: 123 | issues.append("Do not introduce new PII; use placeholders only.") 124 | 125 | # Score scheme 126 | base = 1.0 127 | penalty = 0.25 * len(issues) # tune per your tolerance 128 | score = max(0.0, base - penalty) 129 | feedback = " ".join(issues) if issues else ( 130 | "Great: no leaks, headers intact, bullets present; keep edits minimal and factual." 131 | ) 132 | return dspy.Prediction(score=score, feedback=feedback) 133 | 134 | 135 | # 5) Run GEPA (reflection model must be provided) 136 | gepa = dspy.GEPA( 137 | # metric=pii_metric, 138 | metric=composite_pii_metric, 139 | auto="light", 140 | reflection_lm=reflect_lm, 141 | track_stats=True, 142 | track_best_outputs=True # also useful as an inference-time search to surface best candidates per input 143 | ) # See GEPA API for params like candidate_selection_strategy='pareto'. 
:contentReference[oaicite:12]{index=12} 144 | 145 | optimized = gepa.compile(student, trainset=trainset, valset=devset) 146 | 147 | # 6) Try it 148 | test_report = ( 149 | "Root cause: Dave Miller called 650-555-0000 to report breach.\n" 150 | "Action items:\n- email dave@contoso.com\n- notify legal" 151 | ) 152 | print(optimized(report=test_report, rules=RULES).clean_report) 153 | 154 | # Optional: Inspect the Pareto/best outputs per instance 155 | # print(optimized.detailed_results.best_outputs_valset) # requires track_best_outputs=True :contentReference[oaicite:13]{index=13} -------------------------------------------------------------------------------- /dspy-fact-checker/fact_check_rag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dspy 3 | import wikipedia 4 | from dspy.dsp.utils import dotdict 5 | 6 | 7 | # -------------------------------------------------------------------------------------- 8 | # Wikipedia API Retriever that returns objects with a `.long_text` field (DSPy expects this) 9 | # -------------------------------------------------------------------------------------- 10 | class WikipediaRetriever: 11 | """Simple retriever using the Wikipedia Python API.""" 12 | 13 | def __init__(self, max_chars_per_passage: int = 1500, language: str = "en"): 14 | self.max_chars = max_chars_per_passage 15 | wikipedia.set_lang(language) 16 | 17 | def __call__(self, query: str, k: int = 8, **kwargs): 18 | """Return a list[dotdict] where each item has a `.long_text` attribute.""" 19 | try: 20 | titles = wikipedia.search(query, results=k) or [] 21 | except Exception: 22 | titles = [] 23 | 24 | passages = [] 25 | 26 | for title in titles[:k]: 27 | page = None 28 | picked_title = title 29 | 30 | try: 31 | page = wikipedia.page(title, auto_suggest=False) 32 | except wikipedia.exceptions.DisambiguationError as e: 33 | # Resolve disambiguation by trying the first few options 34 | for opt in e.options[:3]: 35 | try: 36 | page = wikipedia.page(opt, auto_suggest=False) 37 | picked_title = opt 38 | break 39 | except Exception: 40 | continue 41 | except Exception: 42 | pass 43 | 44 | if not page: 45 | continue 46 | 47 | text = (page.summary or "").strip() 48 | if not text: 49 | continue 50 | 51 | if len(text) > self.max_chars: 52 | text = text[: self.max_chars].rstrip() + "..." 53 | 54 | # IMPORTANT: Return a structure with `.long_text` 55 | # We also embed title + URL into the long_text so they survive DSPy's mapping. 56 | long_text = f"{picked_title}: {text} (Source: {page.url})" 57 | passages.append( 58 | dotdict( 59 | {"long_text": long_text, "title": picked_title, "url": page.url} 60 | ) 61 | ) 62 | 63 | if len(passages) >= k: 64 | break 65 | 66 | return passages 67 | 68 | 69 | # -------------------------------------------------------------------------------------- 70 | # OpenAI LM configuration (uses OPENAI_API_KEY from env) 71 | # -------------------------------------------------------------------------------------- 72 | # You can also pass api_key=... directly to dspy.LM if you prefer 73 | lm = dspy.LM("openai/gpt-4o-mini", api_key=os.environ.get("OPENAI_API_KEY")) 74 | dspy.configure( 75 | lm=lm 76 | ) # Official pattern for configuring the default LM. 
:contentReference[oaicite:1]{index=1} 77 | 78 | # Configure the retriever for dspy.Retrieve 79 | wiki_rm = WikipediaRetriever(max_chars_per_passage=1500, language="en") 80 | dspy.settings.configure(rm=wiki_rm) 81 | 82 | 83 | # -------------------------------------------------------------------------------------- 84 | # Signatures 85 | # -------------------------------------------------------------------------------------- 86 | class GenerateAnswer(dspy.Signature): 87 | """Answer the question strictly from the provided context. 88 | - Use only facts present in the context. 89 | - If the context doesn't contain the answer, say you don't know. 90 | - Keep the answer concise (2-5 sentences). 91 | """ 92 | 93 | context = dspy.InputField(desc="retrieved passages from Wikipedia") 94 | question = dspy.InputField() 95 | answer = dspy.OutputField(desc="factual answer derived ONLY from the context") 96 | 97 | 98 | class VerifyAnswer(dspy.Signature): 99 | """Given 'context' and an 'answer', list any claims in the answer that are NOT supported by the context. 100 | Output 'None' if every claim is supported. 101 | """ 102 | 103 | context = dspy.InputField(desc="retrieved passages from Wikipedia") 104 | answer = dspy.InputField(desc="candidate answer to verify") 105 | unsupported_claims = dspy.OutputField( 106 | desc="List unsupported claims or 'None' if fully supported." 107 | ) 108 | 109 | 110 | # -------------------------------------------------------------------------------------- 111 | # Self-correcting RAG with dspy.Refine 112 | # -------------------------------------------------------------------------------------- 113 | class FactCheckedRAG(dspy.Module): 114 | def __init__(self, k_passages: int = 4, max_attempts: int = 3): 115 | super().__init__() 116 | self.retrieve = dspy.Retrieve(k=k_passages) 117 | self.generate_answer = dspy.ChainOfThought(GenerateAnswer) 118 | self.verify_answer = dspy.ChainOfThought(VerifyAnswer) 119 | 120 | # Reward: 1.0 when verifier returns "None" 121 | def reward_fn(args, pred): 122 | context_text = args["context"] 123 | verification = self.verify_answer(context=context_text, answer=pred.answer) 124 | uc = (verification.unsupported_claims or "").strip().lower() 125 | return 1.0 if (uc == "" or uc == "none" or uc.startswith("none")) else 0.0 126 | 127 | # Retry generation with automatic feedback until reward meets threshold 128 | self.refine_generate = dspy.Refine( 129 | module=self.generate_answer, 130 | N=max_attempts, 131 | reward_fn=reward_fn, 132 | threshold=1.0, 133 | ) 134 | 135 | def forward(self, question: str): 136 | # Retrieve evidence (DSPy maps dotdicts -> list[str] via `.long_text`) 137 | passages = self.retrieve(question).passages # List[str] 138 | context_text = "\n\n".join([f"[{i + 1}] {p}" for i, p in enumerate(passages)]) 139 | 140 | # Generate (and refine if needed) until verified 141 | pred = self.refine_generate(context=context_text, question=question) 142 | answer = pred.answer 143 | 144 | # Final verification for reporting 145 | final_check = self.verify_answer(context=context_text, answer=answer) 146 | 147 | return dspy.Prediction( 148 | answer=answer, 149 | context=context_text, 150 | unsupported_claims=final_check.unsupported_claims, 151 | ) 152 | 153 | 154 | # -------------------------------------------------------------------------------------- 155 | # Run 156 | # -------------------------------------------------------------------------------------- 157 | if __name__ == "__main__": 158 | program = FactCheckedRAG(k_passages=4, max_attempts=3) 
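    # k_passages sets how many Wikipedia passages are retrieved as context;
    # max_attempts caps how many times dspy.Refine regenerates an unverified answer.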
159 | 160 | # Try any well-known topic 161 | question = ( 162 | "When did Apollo 11 land on the Moon, and who were the astronauts involved?" 163 | ) 164 | # question = "Who discovered penicillin and in which year was it first reported?" 165 | # question = "When was the first FIFA World Cup held, and where?" 166 | 167 | result = program(question) 168 | 169 | print(f"\nQuestion: {question}") 170 | print("-" * 80) 171 | print("Final Answer:\n", result.answer) 172 | print("-" * 80) 173 | print("Unsupported Claims:", result.unsupported_claims) 174 | print("-" * 80) 175 | print("Context used:\n", result.context) 176 | -------------------------------------------------------------------------------- /dspy-gepa-sql-generator/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Natural Language to SQL with GEPA 2 | 3 | A demonstration of using [DSPy](https://github.com/stanfordnlp/dspy) with the GEPA optimizer to automatically improve prompts for converting natural language questions into SQL queries. The system learns to generate safe, correct SQL through iterative reflection and feedback. 4 | 5 | ## Features 6 | 7 | - **Automatic Prompt Optimization**: Uses GEPA to evolve SQL generation instructions 8 | - **Safety Validation**: Blocks DDL/DML operations and enforces SELECT-only queries 9 | - **Execution Verification**: Validates SQL syntax and execution correctness 10 | - **Result Correctness**: Compares query results against expected outputs 11 | - **Heuristic Guidance**: Provides feedback on ordering, limits, aliases, and output shape 12 | - **Zero-Shot Learning**: Improves from input/output pairs without labeled prompts 13 | 14 | ## How It Works 15 | 16 | 1. **Schema Description**: Creates a compact representation of database tables with sample data 17 | 2. **Query Generation**: Uses DSPy ChainOfThought to generate SQL from natural language 18 | 3. **Multi-Level Validation**: Custom metric checks safety, execution, and correctness 19 | 4. **GEPA Optimization**: Automatically refines prompts based on performance feedback 20 | 5. **Iterative Improvement**: Learns from mistakes to generate better queries 21 | 22 | The system uses a comprehensive metric that scores queries on: 23 | - **Safety** (40%): Only SELECT/WITH statements, no forbidden operations 24 | - **Execution** (30%): SQL must run without errors 25 | - **Correctness** (30%): Results must match expected output 26 | - **Heuristic penalties**: Deducted for missing ORDER BY, LIMIT, DISTINCT, etc. 27 | 28 | ## Requirements 29 | 30 | - Python 3.13+ 31 | - OpenAI API key (or compatible LLM provider) 32 | - DSPy with GEPA optimizer support 33 | 34 | ## Installation 35 | 36 | 1. Change to the project folder after cloning the repo: 37 | ```bash 38 | cd dspy-gepa-sql-generator 39 | ``` 40 | 41 | 2. Install dependencies using [uv](https://github.com/astral-sh/uv): 42 | ```bash 43 | uv sync 44 | ``` 45 | 46 | Or with pip: 47 | ```bash 48 | pip install dspy-ai 49 | ``` 50 | 51 | 3. Set your OpenAI API key: 52 | ```bash 53 | export OPENAI_API_KEY="your-api-key-here" # Linux/Mac 54 | # OR 55 | set OPENAI_API_KEY=your-api-key-here # Windows 56 | ``` 57 | 58 | ## Usage 59 | 60 | Run the example: 61 | ```bash 62 | uv run nl2sql_gepa.py 63 | ``` 64 | 65 | The script will: 66 | 1. Show baseline performance on development queries 67 | 2. Run GEPA optimization on training queries 68 | 3. Display post-optimization performance improvements 69 | 4. 
Show before/after comparison on sample queries 70 | 71 | ### Example Output 72 | 73 | ``` 74 | == Baseline on devset == 75 | 76 | Q: Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending. 77 | SQL: 78 | SELECT a.name, SUM(s.sold) FROM authors a 79 | JOIN books b ON b.author_id = a.id 80 | JOIN sales s ON s.book_id = b.id 81 | WHERE s.year = 2024 82 | GROUP BY a.name 83 | Score: 0.600 84 | Feedback: 85 | Use ORDER BY ... DESC for descending/top/most queries. 86 | Add LIMIT 3 as requested (top 3). 87 | 88 | ... 89 | 90 | == PostGEPA on devset == 91 | 92 | Q: Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending. 93 | SQL: 94 | SELECT a.name, SUM(s.sold) AS total_sold 95 | FROM authors a 96 | JOIN books b ON b.author_id = a.id 97 | JOIN sales s ON s.book_id = b.id 98 | WHERE s.year = 2024 99 | GROUP BY a.name 100 | ORDER BY total_sold DESC 101 | LIMIT 3; 102 | Score: 1.000 103 | Feedback: 104 | Perfect score. Keep current strategy. 105 | ``` 106 | 107 | ## Configuration 108 | 109 | ### Model Settings 110 | 111 | Customize the models used for generation and reflection: 112 | 113 | ```python 114 | STUDENT_MODEL = os.getenv("DSPY_STUDENT_MODEL", "openai/gpt-4o-mini") 115 | REFLECT_MODEL = os.getenv("DSPY_REFLECT_MODEL", "openai/gpt-4o") 116 | ``` 117 | 118 | Set via environment variables: 119 | ```bash 120 | export DSPY_STUDENT_MODEL="openai/gpt-4o-mini" 121 | export DSPY_REFLECT_MODEL="openai/gpt-4o" 122 | ``` 123 | 124 | ### Database Schema 125 | 126 | The example uses an in-memory SQLite database with: 127 | - **authors**: id, name, country 128 | - **books**: id, title, year, author_id, genre, pages, price 129 | - **sales**: book_id, year, sold 130 | 131 | Modify `setup_db()` to use your own schema and data. 132 | 133 | ### Training Set 134 | 135 | The example includes 14 questions ranging from simple to complex: 136 | - Basic filtering and joins 137 | - Aggregations and grouping 138 | - Ordering and limiting 139 | - String matching and case handling 140 | - Percentage calculations 141 | 142 | Adjust `questions_and_gold_sql()` to add your own examples. 
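For example, a new question/gold-SQL pair can be appended to the `base` list inside `questions_and_gold_sql()`. The question below is hypothetical; the gold SQL only needs to run against the demo schema:

```python
# Hypothetical extra training pair for questions_and_gold_sql().
extra_pair = (
    "How many books has each author published? Return name and count, descending.",
    "SELECT a.name, COUNT(*) AS n_books "
    "FROM authors a JOIN books b ON b.author_id = a.id "
    "GROUP BY a.name ORDER BY n_books DESC;",
)
```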
143 | 144 | ### GEPA Parameters 145 | 146 | Tune optimization settings: 147 | 148 | ```python 149 | gepa = GEPA( 150 | metric=sql_metric_scalar, 151 | auto="medium", # Optimization depth: "light", "medium", "heavy" 152 | reflection_lm=reflection_lm, # Model for reflection 153 | track_stats=True, # Track optimization statistics 154 | ) 155 | ``` 156 | 157 | ## Architecture 158 | 159 | ### Components 160 | 161 | **NL2SQL Signature**: Defines the input/output specification for SQL generation 162 | - Input: Database schema with sample rows, natural language question 163 | - Output: Single safe SQL SELECT statement 164 | 165 | **NL2SQLProgram Module**: Wraps ChainOfThought for SQL generation 166 | 167 | **WikipediaRetriever**: Custom metric combining multiple validation layers: 168 | - Safety checks (forbidden keywords, statement type) 169 | - Execution validation (syntax, runtime errors) 170 | - Correctness verification (result matching) 171 | - Heuristic penalties (ordering, limits, aliases) 172 | 173 | **GEPA Optimizer**: Iteratively improves prompts through reflection 174 | 175 | ### Key Technologies 176 | 177 | - [DSPy](https://github.com/stanfordnlp/dspy): Framework for programming language models 178 | - [GEPA](https://dspy.ai/api/optimizers/GEPA/overview/): Generalized Evolution of Prompting via Adaptation 179 | - SQLite: Lightweight database for validation 180 | - OpenAI GPT-4o/GPT-4o-mini: Language models for generation and reflection 181 | 182 | ## Advanced Usage 183 | 184 | ### Custom Databases 185 | 186 | Replace the in-memory SQLite database with your own: 187 | 188 | ```python 189 | def setup_db() -> sqlite3.Connection: 190 | conn = sqlite3.connect("your_database.db") 191 | return conn 192 | ``` 193 | 194 | ### Different LLM Providers 195 | 196 | Use other DSPy-supported models: 197 | 198 | ```python 199 | # Anthropic Claude 200 | student_lm = dspy.LM("anthropic/claude-3-5-sonnet-20241022") 201 | 202 | # Local models 203 | student_lm = dspy.LM("ollama/llama3") 204 | ``` 205 | 206 | ### Metric Customization 207 | 208 | Adjust scoring weights in `sql_metric()`: 209 | 210 | ```python 211 | # Current weights: 212 | # +0.4 for safety 213 | # +0.3 for execution 214 | # +0.3 for correctness 215 | # -0.05 for each heuristic violation 216 | ``` 217 | 218 | ## Limitations 219 | 220 | - Requires gold SQL examples for correctness validation (can work without, but with reduced signal) 221 | - Limited to SELECT queries (DML/DDL operations are blocked) 222 | - Performance depends on LLM capabilities and schema complexity 223 | - English language queries by default 224 | 225 | ## Contributing 226 | 227 | Contributions are welcome! Please feel free to submit issues or pull requests. 228 | 229 | ## License 230 | 231 | MIT License - see [LICENSE](LICENSE) for details. 232 | 233 | ## Acknowledgments 234 | 235 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 236 | - Uses the [GEPA optimizer](https://dspy.ai/api/optimizers/GEPA/overview/) for automatic prompt improvement 237 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Multi-Agent Research Pipeline 2 | 3 | An intelligent, multi-agent research pipeline that autonomously conducts web research and generates comprehensive, citation-backed reports using DSPy, LangGraph, and the Exa search API. 
4 | 5 | ## Features 6 | 7 | - **Multi-Agent Architecture**: Coordinated agents for query planning, search, summarization, writing, and review using LangGraph 8 | - **Smart Web Research**: Powered by Exa API for high-quality web search with full-text retrieval 9 | 10 | - **Automated Writing**: Generates polished Markdown reports with proper citations and references 11 | - **Prompt Optimization**: Optional GEPA (Generalized Evolution of Prompting via Adaptation) for automatic prompt tuning 12 | - **Iterative Research**: Gap analysis determines when additional research is needed 13 | - **Quality Assurance**: Built-in review and revision cycle for report quality 14 | - **Structured Citations**: Global citation management with numbered references 15 | 16 | ## Architecture 17 | 18 | The pipeline uses a graph-based workflow with the following stages: 19 | 20 | 1. **Query Planning** → Generate diverse search queries with operators (site:, intitle:, quoted phrases) 21 | 2. **Parallel Search** → Execute queries via Exa API and extract full-text content 22 | 3. **Summarization** → Convert sources into cited evidence bullets 23 | 4. **Gap Analysis** → Determine if more research is needed (up to 2 rounds) 24 | 5. **Section Writing** → Draft sections with proper citations in parallel 25 | 6. **Assembly & Review** → Combine sections, renumber citations globally, review quality 26 | 7. **Revision** → Apply suggestions if quality checks fail 27 | 28 | ### Technology Stack 29 | 30 | - **[DSPy](https://github.com/stanfordnlp/dspy)**: Framework for programming language models with signatures 31 | - **[LangGraph](https://github.com/langchain-ai/langgraph)**: Orchestration for multi-agent workflows with parallelism 32 | - **[Exa API](https://exa.ai/)**: Neural search engine for web research 33 | - **Google Gemini**: LLM backend (2.5 Flash for research, 2.5 Pro for writing) 34 | - **GEPA**: Efficient prompt optimization using reflection 35 | 36 | ## Requirements 37 | 38 | - Python 3.13+ 39 | - API Keys: 40 | - Google Gemini API key 41 | - Exa API key (get from [dashboard.exa.ai](https://dashboard.exa.ai)) 42 | 43 | ## Installation 44 | 45 | ```bash 46 | # Clone the repository 47 | cd dspy-gepa-researcher 48 | 49 | # Install dependencies using uv (recommended) 50 | uv add dspy langgraph exa-py python-dateutil pydantic 51 | 52 | # Or with pip 53 | pip install dspy langgraph exa-py python-dateutil pydantic 54 | ``` 55 | 56 | ## Configuration 57 | 58 | Set the required environment variables: 59 | 60 | ```bash 61 | export GEMINI_API_KEY="your-gemini-api-key" 62 | export EXA_API_KEY="your-exa-api-key" 63 | 64 | # Optional configuration 65 | export GEMINI_WRITER_MODEL="gemini/gemini-flash-latest" # Default 66 | export GEMINI_RESEARCH_MODEL="gemini/gemini-flash-latest" # Default 67 | export RR_MAX_ROUNDS="2" # Research rounds 68 | export RR_SEARCH_K="6" # Results per query 69 | export RR_MAX_CHARS="12000" # Max chars per source 70 | ``` 71 | 72 | ## Usage 73 | 74 | ### Basic Usage 75 | 76 | ```python 77 | import asyncio 78 | from dspy_gepa_researcher import run_pipeline, SectionSpec 79 | 80 | # Define your research sections 81 | sections = [ 82 | SectionSpec( 83 | name="Executive Summary", 84 | instructions="In 180-250 words, summarize the most decision-relevant takeaways." 85 | ), 86 | SectionSpec( 87 | name="Market Landscape", 88 | instructions="Define the space; 2023-2025 trends; include 4+ specific figures with sources." 
89 | ), 90 | SectionSpec( 91 | name="Key Players & Differentiation", 92 | instructions="Compare 5-7 players; list 1-2 distinctive capabilities each." 93 | ), 94 | ] 95 | 96 | # Run the pipeline 97 | topic = "State of Edge AI Acceleration (2024-2025)" 98 | result = asyncio.run(run_pipeline( 99 | topic=topic, 100 | sections=sections, 101 | optimization=False # Set to True to enable GEPA optimization 102 | )) 103 | 104 | # Report is saved to ./report.md 105 | ``` 106 | 107 | ### Running the Example 108 | 109 | ```bash 110 | uv run dspy_gepa_researcher.py 111 | ``` 112 | 113 | This will generate a report on "State of Edge AI Acceleration (2024-2025)" with 5 sections. 114 | 115 | ## How It Works 116 | 117 | ### 1. Query Generation 118 | The `QUERY_GEN` agent creates 4-8 diverse search queries per section using: 119 | - Quoted phrases for exact matches 120 | - `site:` operators to target specific domains 121 | - `intitle:` to find relevant titles 122 | - Date ranges for time-specific research 123 | 124 | ### 2. Parallel Search & Summarization 125 | Each query is executed in parallel: 126 | - Exa API retrieves top-k documents with full text 127 | - `SUMMARIZER` agent extracts key facts with `[S#]` citations 128 | - Results are aggregated per section 129 | 130 | ### 3. Gap Analysis 131 | The `GAP_ANALYZER` agent reviews evidence for each section: 132 | - Determines if coverage is sufficient 133 | - Generates follow-up queries if needed 134 | - Proceeds to writing after max rounds or sufficient coverage 135 | 136 | ### 4. Section Writing 137 | Sections are written in parallel: 138 | - `WRITE_SECTION` drafts markdown with `[n]` citations 139 | - `CITE_FIXER` ensures proper citation format 140 | - Character count and citation stats are tracked 141 | 142 | ### 5. Assembly & Review 143 | - Citations are renumbered globally across all sections 144 | - `REVIEWER` agent checks for quality issues 145 | - `REVISER` agent applies suggestions if needed 146 | - Final report is saved with References section 147 | 148 | ## GEPA Optimization 149 | 150 | The pipeline supports automatic prompt optimization using GEPA (Generalized Evolution of Prompting via Adaptation). When enabled, it optimizes prompts for all 7 agents: 151 | 152 | 1. `QUERY_GEN` - Search query generation 153 | 2. `SUMMARIZER` - Source summarization 154 | 3. `WRITE_SECTION` - Section writing 155 | 4. `GAP_ANALYZER` - Coverage analysis 156 | 5. `CITE_FIXER` - Citation formatting 157 | 6. `REVIEWER` - Quality review 158 | 7. `REVISER` - Report revision 159 | 160 | Each agent is trained on 2-3 high-quality examples specific to its task. 161 | 162 | To enable optimization: 163 | ```python 164 | result = asyncio.run(run_pipeline(topic, sections, optimization=True)) 165 | ``` 166 | 167 | ## Output 168 | 169 | The pipeline generates: 170 | 171 | 1. **report.md**: Complete markdown report with: 172 | - All sections with proper headings 173 | - Inline numeric citations `[1]`, `[2]`, etc. 174 | - References section with full metadata 175 | - Source URLs, publication dates, and domains 176 | 177 | 2. **Console Logs**: Detailed progress tracking: 178 | ``` 179 | [QUERY] Planning search queries for 5 sections... 180 | [SEARCH] 'Market Size': "market size TAM 2024"... 181 | [SEARCH] → Found 6 documents 182 | [SEARCH] → Extracted 4 evidence bullets 183 | [GAP] Analyzing research coverage (Round 1/2)... 184 | [WRITE] Drafting section: 'Market Size' 185 | [WRITE] → 2341 chars, 8 citations 186 | [REVIEW] → Pass: true, Issues: 0, Suggestions: 2 187 | ``` 188 | 189 | 3. 
**Evaluation Metrics**: 190 | - Overall quality score 191 | - Citation count 192 | - Character count 193 | - Structure checks 194 | 195 | ## Project Structure 196 | 197 | ``` 198 | dspy-gepa-researcher/ 199 | dspy_gepa_researcher.py # Main pipeline implementation 200 | README.md # This file 201 | report.md # Generated output (after running) 202 | LICENSE # MIT License 203 | ``` 204 | 205 | ## Customization 206 | 207 | ### Adding New Sections 208 | 209 | ```python 210 | sections.append(SectionSpec( 211 | name="Technology Stack", 212 | instructions="List 5+ technologies with adoption rates, version info, and benchmarks" 213 | )) 214 | ``` 215 | 216 | ### Adjusting Research Depth 217 | 218 | ```python 219 | # More queries per section (default: 6) 220 | export RR_SEARCH_K="10" 221 | 222 | # More research rounds (default: 2) 223 | export RR_MAX_ROUNDS="3" 224 | ``` 225 | 226 | ### Changing Models 227 | 228 | ```python 229 | export GEMINI_WRITER_MODEL="gemini/gemini-2.5-pro-preview-03-25" 230 | export GEMINI_RESEARCH_MODEL="gemini/gemini-2.5-flash-preview-03-25" 231 | ``` 232 | 233 | ## Limitations 234 | 235 | - Requires active API keys (Gemini and Exa) 236 | - Token costs scale with number of sections and research rounds 237 | - Gemini models have output limits (8,192 tokens max per response) 238 | - Quality depends on Exa search result quality 239 | 240 | ## Contributing 241 | 242 | Contributions are welcome! Please feel free to submit a Pull Request. 243 | 244 | ## License 245 | 246 | MIT License - see [LICENSE](LICENSE) for details. 247 | 248 | ## Acknowledgments 249 | 250 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 251 | - Powered by [Exa](https://exa.ai/) neural search 252 | - Orchestrated with [LangGraph](https://github.com/langchain-ai/langgraph) 253 | 254 | ## Citation 255 | 256 | If you use this project in your research, please cite: 257 | 258 | ```bibtex 259 | @software{dspy_examples, 260 | title={DSPy Multi-Agent Research Pipeline}, 261 | author={Your Name}, 262 | year={2025}, 263 | url={https://github.com/raja-patnaik/dspy-examples} 264 | } 265 | ``` 266 | -------------------------------------------------------------------------------- /dspy-gepa-sql-generator/nl2sql_gepa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sqlite3 4 | import inspect 5 | from typing import List, Tuple, Optional 6 | 7 | import dspy 8 | 9 | # ----------------------------------------------------------------------------- 10 | # 0) Model configuration 11 | # ----------------------------------------------------------------------------- 12 | STUDENT_MODEL = os.getenv("DSPY_STUDENT_MODEL", "openai/gpt-4o-mini") 13 | REFLECT_MODEL = os.getenv("DSPY_REFLECT_MODEL", "openai/gpt-4o") 14 | 15 | student_lm = dspy.LM(STUDENT_MODEL, temperature=0.2, max_tokens=800) 16 | reflection_lm = dspy.LM(REFLECT_MODEL, temperature=0.8, max_tokens=2000) 17 | dspy.configure(lm=student_lm) 18 | 19 | # GEPA import (newer/older DSPy layouts) 20 | try: 21 | from dspy import GEPA 22 | except Exception: 23 | try: 24 | from dspy.teleprompt import GEPA # older path 25 | except Exception: 26 | GEPA = None 27 | 28 | 29 | # ----------------------------------------------------------------------------- 30 | # 1) Tiny SQLite database and schema description 31 | # ----------------------------------------------------------------------------- 32 | def setup_db() -> sqlite3.Connection: 33 | conn = sqlite3.connect(":memory:") 34 | c = 
conn.cursor() 35 | 36 | c.executescript(""" 37 | CREATE TABLE authors ( 38 | id INTEGER PRIMARY KEY, 39 | name TEXT NOT NULL, 40 | country TEXT NOT NULL 41 | ); 42 | CREATE TABLE books ( 43 | id INTEGER PRIMARY KEY, 44 | title TEXT NOT NULL, 45 | year INTEGER NOT NULL, 46 | author_id INTEGER NOT NULL, 47 | genre TEXT NOT NULL, 48 | pages INTEGER NOT NULL, 49 | price REAL NOT NULL, 50 | FOREIGN KEY(author_id) REFERENCES authors(id) 51 | ); 52 | CREATE TABLE sales ( 53 | book_id INTEGER NOT NULL, 54 | year INTEGER NOT NULL, 55 | sold INTEGER NOT NULL, 56 | FOREIGN KEY(book_id) REFERENCES books(id) 57 | ); 58 | """) 59 | 60 | authors = [ 61 | (1, "Margaret Atwood", "Canada"), 62 | (2, "Haruki Murakami", "Japan"), 63 | (3, "Chimamanda Ngozi Adichie", "Nigeria"), 64 | (4, "Neil Gaiman", "UK"), 65 | (5, "Alice Munro", "Canada"), 66 | ] 67 | c.executemany("INSERT INTO authors VALUES (?, ?, ?)", authors) 68 | 69 | books = [ 70 | (1, "The Handmaid's Tale", 1985, 1, "Dystopia", 311, 9.99), 71 | (2, "Kafka on the Shore", 2002, 2, "Magical Realism", 505, 14.99), 72 | (3, "American Gods", 2001, 4, "Fantasy", 465, 12.99), 73 | (4, "Half of a Yellow Sun", 2006, 3, "Historical", 448, 13.99), 74 | (5, "The Testaments", 2019, 1, "Dystopia", 419, 15.99), 75 | (6, "Norwegian Wood", 1987, 2, "Romance", 296, 10.99), 76 | (7, "Dear Life", 2012, 5, "Short Stories", 336, 11.99), 77 | (8, "Neverwhere", 1996, 4, "Fantasy", 370, 9.49), 78 | (9, "Oryx and Crake", 2003, 1, "Dystopia", 389, 11.49), 79 | ] 80 | c.executemany("INSERT INTO books VALUES (?, ?, ?, ?, ?, ?, ?)", books) 81 | 82 | sales = [ 83 | (1, 2024, 12000), 84 | (2, 2024, 15000), 85 | (3, 2024, 16000), 86 | (4, 2024, 11000), 87 | (5, 2024, 9000), 88 | (6, 2024, 13000), 89 | (7, 2024, 7000), 90 | (8, 2024, 8000), 91 | (9, 2024, 10000), 92 | ] 93 | c.executemany("INSERT INTO sales VALUES (?, ?, ?)", sales) 94 | 95 | conn.commit() 96 | return conn 97 | 98 | 99 | def describe_schema(conn: sqlite3.Connection, sample_rows: int = 2) -> str: 100 | """Create a compact, LM-friendly schema string with a couple of sample rows per table.""" 101 | c = conn.cursor() 102 | parts = [] 103 | for table in ["authors", "books", "sales"]: 104 | c.execute(f"PRAGMA table_info({table})") 105 | cols = [f"{row[1]}:{row[2]}" for row in c.fetchall()] 106 | parts.append(f"TABLE {table}({', '.join(cols)})") 107 | c.execute(f"SELECT * FROM {table} LIMIT {sample_rows}") 108 | rows = c.fetchall() 109 | parts.append(f"EXAMPLE_ROWS {table}: {rows}") 110 | return "\n".join(parts) 111 | 112 | 113 | # ----------------------------------------------------------------------------- 114 | # 2) DSPy program (Signature + Module) 115 | # ----------------------------------------------------------------------------- 116 | class NL2SQL(dspy.Signature): 117 | """Generate a single safe SQLite SELECT to answer the question from the given schema.""" 118 | 119 | schema = dspy.InputField(desc="SQLite schema and 1–2 sample rows per table") 120 | question = dspy.InputField(desc="Natural-language question about the data") 121 | sql = dspy.OutputField( 122 | desc=( 123 | "Only ONE statement. Start with SELECT or WITH. " 124 | "Use exact column/table names. " 125 | "Return only the SQL (no comments)." 
126 | ) 127 | ) 128 | 129 | 130 | class NL2SQLProgram(dspy.Module): 131 | def __init__(self): 132 | super().__init__() 133 | self.generate = dspy.ChainOfThought( 134 | NL2SQL 135 | ) # hidden reasoning; SQL only as output 136 | 137 | def forward(self, schema: str, question: str): 138 | return self.generate(schema=schema, question=question) 139 | 140 | 141 | # ----------------------------------------------------------------------------- 142 | # 3) Questions + (optional) gold SQL 143 | # ----------------------------------------------------------------------------- 144 | def questions_and_gold_sql() -> List[Tuple[str, Optional[str]]]: 145 | base: List[Tuple[str, Optional[str]]] = [ 146 | ( 147 | "List the titles of books written by authors from Canada, alphabetically.", 148 | "SELECT b.title FROM books b JOIN authors a ON a.id=b.author_id " 149 | "WHERE a.country='Canada' ORDER BY b.title;", 150 | ), 151 | ( 152 | "Which author sold the most copies in 2024?", 153 | "SELECT a.name FROM authors a " 154 | "JOIN books b ON a.id=b.author_id " 155 | "JOIN sales s ON s.book_id=b.id " 156 | "WHERE s.year=2024 " 157 | "GROUP BY a.name ORDER BY SUM(s.sold) DESC LIMIT 1;", 158 | ), 159 | ( 160 | "How many books per genre? Return genre and count, count descending.", 161 | "SELECT genre, COUNT(*) AS n FROM books GROUP BY genre ORDER BY n DESC;", 162 | ), 163 | ( 164 | "What are the top 2 longest books by page count? Return title and pages.", 165 | "SELECT title, pages FROM books ORDER BY pages DESC LIMIT 2;", 166 | ), 167 | ( 168 | "Average price of books published in or after 2010. Return a single number.", 169 | "SELECT ROUND(AVG(price), 2) AS avg_price FROM books WHERE year >= 2010;", 170 | ), 171 | ( 172 | "List distinct countries represented by authors, alphabetically.", 173 | "SELECT DISTINCT country FROM authors ORDER BY country;", 174 | ), 175 | ( 176 | "For Haruki Murakami, what is the average pages of his books? 
Return name and avg pages.", 177 | "SELECT a.name, ROUND(AVG(b.pages), 1) AS avg_pages " 178 | "FROM authors a JOIN books b ON a.id=b.author_id " 179 | "WHERE a.name='Haruki Murakami' GROUP BY a.name;", 180 | ), 181 | ( 182 | "Find the cheapest Fantasy book (title + price).", 183 | "SELECT title, price FROM books WHERE genre='Fantasy' ORDER BY price ASC LIMIT 1;", 184 | ), 185 | ( 186 | "Return titles that contain the word 'the' (case-insensitive), alphabetically.", 187 | "SELECT title FROM books WHERE LOWER(title) LIKE '%the%' ORDER BY title;", 188 | ), 189 | ( 190 | "How many books did Margaret Atwood publish after 2000?", 191 | "SELECT COUNT(*) AS n FROM books b " 192 | "JOIN authors a ON a.id=b.author_id " 193 | "WHERE a.name='Margaret Atwood' AND year > 2000;", 194 | ), 195 | ] 196 | 197 | # Harder variations to create headroom (ordering, aliasing, shape, top-k) 198 | extras: List[Tuple[str, Optional[str]]] = [ 199 | ( 200 | "Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending.", 201 | "SELECT a.name, SUM(s.sold) AS total_sold " 202 | "FROM authors a " 203 | "JOIN books b ON b.author_id = a.id " 204 | "JOIN sales s ON s.book_id = b.id " 205 | "WHERE s.year = 2024 " 206 | "GROUP BY a.name " 207 | "ORDER BY total_sold DESC " 208 | "LIMIT 3;", 209 | ), 210 | ( 211 | "List titles containing the word 'the' (case-insensitive), alphabetically, return them as column lower_title.", 212 | "SELECT LOWER(title) AS lower_title " 213 | "FROM books " 214 | "WHERE LOWER(title) LIKE '%the%' " 215 | "ORDER BY lower_title ASC;", 216 | ), 217 | ( 218 | "For each country, return country and number of authors as n_authors; sort by n_authors desc then country asc.", 219 | "SELECT country, COUNT(*) AS n_authors " 220 | "FROM authors " 221 | "GROUP BY country " 222 | "ORDER BY n_authors DESC, country ASC;", 223 | ), 224 | ( 225 | "What percent of books are Dystopia? 
Return a single number named pct_dystopia (1 decimal place).", 226 | "SELECT ROUND(100.0 * SUM(CASE WHEN genre='Dystopia' THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct_dystopia " 227 | "FROM books;", 228 | ), 229 | ] 230 | 231 | return base + extras 232 | 233 | 234 | def run_sql(conn: sqlite3.Connection, sql: str): 235 | cur = conn.cursor() 236 | cur.execute(sql) 237 | rows = cur.fetchall() 238 | cols = [d[0] for d in cur.description] if cur.description else [] 239 | return cols, rows 240 | 241 | 242 | def build_examples(conn: sqlite3.Connection, schema_text: str): 243 | """Build DSPy examples and precompute gold results when available.""" 244 | examples = [] 245 | for q, gold_sql in questions_and_gold_sql(): 246 | expected = None 247 | if gold_sql: 248 | cols, rows = run_sql(conn, gold_sql) 249 | ordered = "ORDER BY" in gold_sql.upper() 250 | expected = {"columns": cols, "rows": rows, "ordered": ordered} 251 | ex = dspy.Example( 252 | schema=schema_text, question=q, expected=expected 253 | ).with_inputs("schema", "question") 254 | examples.append(ex) 255 | return examples 256 | 257 | 258 | # ----------------------------------------------------------------------------- 259 | # 4) Metric: safety + execution + strict(er) correctness + heuristic penalties 260 | # ----------------------------------------------------------------------------- 261 | FORBIDDEN = re.compile( 262 | r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|PRAGMA|ATTACH|DETACH|CREATE|REPLACE|VACUUM|TRIGGER|INDEX|VIEW)\b", 263 | flags=re.IGNORECASE, 264 | ) 265 | 266 | 267 | def _clean_sql(text: str) -> str: 268 | t = (text or "").strip() 269 | t = re.sub(r"^```(?:sql)?", "", t, flags=re.IGNORECASE).strip() 270 | t = re.sub(r"```$", "", t).strip() 271 | # Keep only the first statement; terminate neatly if a semicolon was present 272 | if ";" in t: 273 | t = t.split(";")[0].strip() + ";" 274 | return t 275 | 276 | 277 | def sql_metric( 278 | gold: dspy.Example, 279 | pred: dspy.Prediction, 280 | trace=None, 281 | pred_name=None, 282 | pred_trace=None, 283 | ): 284 | """ 285 | GEPA-friendly metric: returns {'score': float, 'feedback': str}. 286 | 287 | Scoring: 288 | +0.4 single safe SELECT/WITH, no forbidden tokens 289 | +0.3 executes without error 290 | +0.3 exact result match (rows+columns); if not exact but sets equal → +0.15 291 | Heuristic penalties (applied after success): -0.05 each for missing ORDER BY when asked, 292 | wrong direction, missing LIMIT k when asked, missing DISTINCT when asked, wrong output shape for "single number". 
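    Worked example (illustrative): a safe SELECT that executes but returns the right
    rows in the wrong order because it omits the ORDER BY requested by an
    "alphabetically" question scores 0.4 + 0.3 + 0.15 (set-equal rows) - 0.05
    (missing ORDER BY) = 0.80.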
293 | """ 294 | sql_raw = getattr(pred, "sql", "") or "" 295 | sql = _clean_sql(sql_raw) 296 | question = getattr(gold, "question", "") 297 | ql = question.lower() 298 | 299 | score = 0.0 300 | fb = [] 301 | 302 | # 1) Safety / format 303 | if not (sql.lower().startswith("select") or sql.lower().startswith("with")): 304 | fb.append("SQL must start with SELECT or WITH.") 305 | elif FORBIDDEN.search(sql): 306 | fb.append("Forbidden tokens present (DDL/DML/PRAGMA/etc.).") 307 | else: 308 | score += 0.4 309 | 310 | # 2) Execution 311 | exec_cols, exec_rows, exec_err = [], [], None 312 | if score >= 0.4: 313 | try: 314 | conn = ( 315 | setup_db() 316 | ) # fresh DB prevents accidental writes (even though we forbid DDL/DML) 317 | cur = conn.cursor() 318 | cur.execute(sql) 319 | exec_rows = cur.fetchall() 320 | exec_cols = [d[0] for d in cur.description] if cur.description else [] 321 | score += 0.3 322 | except Exception as e: 323 | exec_err = str(e) 324 | fb.append(f"Execution error: {exec_err}") 325 | 326 | # 2.1) Heuristic penalties open headroom for GEPA 327 | penalty = 0.0 328 | if exec_err is None: 329 | su = sql.upper() 330 | 331 | asks_order = any( 332 | w in ql 333 | for w in [ 334 | "alphabet", 335 | "ascending", 336 | "descending", 337 | "top", 338 | "highest", 339 | "lowest", 340 | "most", 341 | ] 342 | ) 343 | if asks_order and "ORDER BY" not in su: 344 | penalty += 0.05 345 | fb.append("Question asks for ordering; add an ORDER BY.") 346 | 347 | if any(w in ql for w in ["descending", "highest", "top", "most", "largest"]): 348 | if "ORDER BY" in su and "DESC" not in su: 349 | penalty += 0.05 350 | fb.append("Use ORDER BY ... DESC for descending/top/most queries.") 351 | 352 | if "alphabet" in ql or "ascending" in ql: 353 | if "ORDER BY" in su and "DESC" in su: 354 | penalty += 0.05 355 | fb.append("Use ORDER BY ... ASC for alphabetical/ascending queries.") 356 | 357 | m = re.search(r"\btop\s+(\d+)\b", ql) 358 | if m: 359 | k = int(m.group(1)) 360 | if f"LIMIT {k}" not in su: 361 | penalty += 0.05 362 | fb.append(f"Add LIMIT {k} as requested (top {k}).") 363 | 364 | if "distinct" in ql and "DISTINCT" not in su: 365 | penalty += 0.05 366 | fb.append("Add DISTINCT as requested.") 367 | 368 | if "single number" in ql: 369 | if not (len(exec_rows) == 1 and len(exec_cols) == 1): 370 | penalty += 0.05 371 | fb.append( 372 | "Return exactly one column and one row for 'single number' requests." 373 | ) 374 | 375 | # 3) Correctness vs. gold (if provided) 376 | s_correct = 0.0 377 | gold_expected = getattr(gold, "expected", None) 378 | if exec_err is None and gold_expected: 379 | gold_rows = gold_expected["rows"] 380 | gold_cols = gold_expected["columns"] 381 | require_order = bool(gold_expected.get("ordered", False)) 382 | 383 | same_cols = exec_cols == gold_cols 384 | 385 | if require_order: 386 | same_rows = exec_rows == gold_rows 387 | else: 388 | same_rows = sorted(map(tuple, exec_rows)) == sorted(map(tuple, gold_rows)) 389 | 390 | if same_cols and same_rows: 391 | s_correct = 0.3 392 | else: 393 | # partial credit: set-equality on rows and columns 394 | set_rows_equal = sorted(map(tuple, exec_rows)) == sorted( 395 | map(tuple, gold_rows) 396 | ) 397 | set_cols_equal = set(exec_cols) == set(gold_cols) 398 | if set_rows_equal and set_cols_equal: 399 | s_correct = 0.15 400 | if not same_cols: 401 | fb.append( 402 | "Column order/aliases differ; add explicit aliases to match expected columns." 
403 | ) 404 | if require_order and not same_rows: 405 | fb.append("Row order differs; add the correct ORDER BY.") 406 | else: 407 | fb.append( 408 | f"Result mismatch. Expected (sample): {gold_rows[:3]} | Got (sample): {exec_rows[:3]}" 409 | ) 410 | 411 | elif exec_err is None and not gold_expected: 412 | # Unlabeled partial credit to keep signal flowing 413 | if len(exec_rows) > 0: 414 | s_correct = 0.15 415 | fb.append( 416 | "No gold available; granting partial credit for non-empty result." 417 | ) 418 | else: 419 | fb.append("Query returned 0 rows; consider joins/filters.") 420 | 421 | score += s_correct 422 | score = max(0.0, min(1.0, score - penalty)) # apply penalties, clamp to [0, 1] 423 | 424 | if score == 1.0: 425 | fb.append("Perfect score. Keep current strategy.") 426 | elif score < 0.4: 427 | fb.append("Rewrite as ONE safe SELECT/WITH; avoid DDL/DML/PRAGMA.") 428 | 429 | return {"score": float(score), "feedback": "\n".join(fb)} 430 | 431 | 432 | def sql_metric_scalar(gold, pred, trace=None, pred_name=None, pred_trace=None): 433 | """Numeric-only wrapper compatible with GEPA's 5-arg metric signature.""" 434 | return float( 435 | sql_metric( 436 | gold, 437 | pred, 438 | trace=trace, 439 | pred_name=pred_name, 440 | pred_trace=pred_trace, 441 | )["score"] 442 | ) 443 | 444 | 445 | def sql_metric_dual(gold, pred, trace=None, pred_name=None, pred_trace=None): 446 | """ 447 | Dual-mode metric for older GEPA builds: 448 | - Evaluation path (no trace): return float score 449 | - Reflection path (has trace/pred_name/pred_trace): return {'score', 'feedback'} 450 | """ 451 | res = sql_metric( 452 | gold, pred, trace=trace, pred_name=pred_name, pred_trace=pred_trace 453 | ) 454 | # If called by Evaluate (no reflection context), return float only: 455 | if (trace is None) and (pred_name is None) and (pred_trace is None): 456 | return float(res["score"]) 457 | # If called by GEPA reflection, return rich feedback: 458 | return res 459 | 460 | 461 | # ----------------------------------------------------------------------------- 462 | # 5) Main — build data, baseline eval, GEPA optimization, post eval 463 | # ----------------------------------------------------------------------------- 464 | def main(): 465 | if GEPA is None: 466 | raise RuntimeError( 467 | "Could not import GEPA from DSPy. Please upgrade/install a DSPy version that includes GEPA." 468 | ) 469 | 470 | # Build schema and examples 471 | conn = setup_db() 472 | schema_text = describe_schema(conn) 473 | all_examples = build_examples(conn, schema_text) 474 | 475 | # Train/dev split (tweak to taste) 476 | trainset = all_examples[:8] 477 | devset = all_examples[8:] 478 | 479 | # Baseline program 480 | program = NL2SQLProgram() 481 | 482 | # Baseline detailed feedback on devset 483 | print("== Baseline on devset ==") 484 | baseline_scores = [] 485 | for ex in devset: 486 | pred = program(schema=ex.schema, question=ex.question) 487 | res = sql_metric(ex, pred) 488 | baseline_scores.append(res["score"]) 489 | print("\nQ:", ex.question) 490 | print("SQL:\n", _clean_sql(getattr(pred, "sql", ""))) 491 | print(f"Score: {res['score']:.3f}") 492 | print("Feedback:\n" + res["feedback"]) 493 | print(f"\nBaseline mean score: {sum(baseline_scores) / len(baseline_scores):.3f}") 494 | 495 | # GEPA optimizer (reflective prompt evolution) 496 | # --- Compatibility wiring: metric must be numeric; feedback wired if supported. 
497 | gepa_kwargs = dict( 498 | metric=sql_metric_scalar, # <-- numeric metric (prevents the dict summation crash) 499 | auto="medium", 500 | reflection_lm=reflection_lm, 501 | track_stats=True, 502 | add_format_failure_as_feedback=True, 503 | ) 504 | 505 | init_sig = inspect.signature(GEPA.__init__).parameters 506 | if "feedback_metric" in init_sig: 507 | gepa_kwargs["feedback_metric"] = sql_metric 508 | elif "feedback_producer" in init_sig: # some older builds 509 | gepa_kwargs["feedback_producer"] = sql_metric 510 | else: 511 | print( 512 | "⚠️ GEPA build does not expose 'feedback_metric'/'feedback_producer'. " 513 | "Optimization will use the scalar metric only (still works, less rich guidance)." 514 | ) 515 | 516 | gepa = GEPA(**gepa_kwargs) 517 | 518 | optimized_program = gepa.compile(program, trainset=trainset, valset=devset) 519 | 520 | # Post‑GEPA detailed feedback on devset 521 | print("\n== Post‑GEPA on devset ==") 522 | post_scores = [] 523 | for ex in devset: 524 | pred = optimized_program(schema=ex.schema, question=ex.question) 525 | res = sql_metric(ex, pred) 526 | post_scores.append(res["score"]) 527 | print("\nQ:", ex.question) 528 | print("SQL:\n", _clean_sql(getattr(pred, "sql", ""))) 529 | print(f"Score: {res['score']:.3f}") 530 | print("Feedback:\n" + res["feedback"]) 531 | print(f"\nPost‑GEPA mean score: {sum(post_scores) / len(post_scores):.3f}") 532 | 533 | # Quick before/after on a single sample 534 | sample = devset[0] 535 | before = program(schema=sample.schema, question=sample.question) 536 | after = optimized_program(schema=sample.schema, question=sample.question) 537 | print("\n== Before/After example ==") 538 | print("Q:", sample.question) 539 | print("Before SQL:\n", _clean_sql(getattr(before, "sql", ""))) 540 | print("After SQL:\n", _clean_sql(getattr(after, "sql", ""))) 541 | 542 | 543 | if __name__ == "__main__": 544 | main() 545 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/report.md: -------------------------------------------------------------------------------- 1 | # Executive Summary 2 | 3 | The global economic and policy landscape is undergoing a fundamental transformation, moving away from the limitations of the Washington Consensus toward a new framework—the London Consensus—that prioritizes social factors, institutional resilience, and national happiness as core measures of success [1, 2, 3, 6]. 4 | 5 | Technological disruption is accelerating this change. The rapid adoption of AI and advanced computing is driving intense global competition for technological leadership [1]. This innovation underpins the exponential growth of Industry 5.0, a market valued at over $51.5 billion in 2023, which emphasizes human-centric, sustainable, and resilient industrial production. Policymakers must recognize the projected $29 trillion to $48 trillion revenue potential of emerging industry arenas by 2040 [2]. 6 | 7 | Simultaneously, geopolitical uncertainty poses significant risks, particularly within the global energy system, despite the ongoing transition driven by clean technologies like solar and wind [19, 20, 21, 22]. Addressing these fragilities requires massive capital deployment; an estimated $106 trillion in cumulative investment is needed through 2040 for global infrastructure alone [3]. 
Decision-makers must align strategic planning with the urgent climate imperative identified by the IPCC [4], focusing on transformative, equitable action while navigating internal policy divides and adapting investment strategies to manage geopolitical volatility and capitalize on AI-driven private markets. 8 | 9 | # Market Landscape 10 | 11 | The global technology market landscape is defined by rapid innovation, intense geopolitical competition, and a strategic shift toward efficiency-driven digital transformation [3, 4, 14]. Following macroeconomic challenges in 2022 and 2023, the sector is poised for robust growth, driven by enterprise investment in automation and strategic technologies [16, 4, 20]. 12 | 13 | ## 2023–2025 Trends and Growth Drivers 14 | 15 | The market space is dominated by software and IT services, which are expected to capture up to **70% of global tech spending** by 2027/2029 [3, 4, 5]. Overall worldwide IT spending was projected at $4.7 trillion in 2023 and is forecasted to grow 8% to reach **$5.1 trillion in 2024** [16, 4]. Forecasts for 2025 range up to $5.8 trillion [5]. 16 | 17 | The primary drivers of this growth are: 18 | 19 | 1. **Generative AI (GenAI) and Automation:** GenAI experienced explosive growth in 2023, with one-third of surveyed organizations using it regularly [6]. While initial adoption focused on proof-of-concept, its material impact on IT budgets is expected to begin in 2025 [7]. Global AI spending is projected to reach **$407 billion in 2025**, representing a 28.6% increase [5]. This demand is fueling the AI accelerators market, which is projected to expand at a robust **CAGR of 44.1%** between 2024 and 2029 [8]. 20 | 2. **Cloud Computing and Infrastructure:** Cloud spending remains a core growth engine. The cloud computing industry is forecast to grow to **$678 billion in 2025** [5]. Furthermore, the Data Center Systems segment is set to grow by 15.5% in 2025, driven by the need for increased computing power [9]. 21 | 3. **Cybersecurity and Modernization:** Organizations are prioritizing cybersecurity and the modernization of legacy systems to improve operational efficiency and address persistent IT talent shortages [1, 3, 5]. 22 | 23 | ## Macroeconomic Context 24 | 25 | While the technology outlook is optimistic, the global business landscape faces continued uncertainty. The 2025 outlook is characterized by both promise (potential soft landings and descending interest rates) and peril (intensifying geopolitical conflicts and trade tensions) [9, 11]. Geopolitical factors are increasingly shaping production choices, particularly in strategic areas like semiconductors and sovereign AI advancements [9, 31]. Investment opportunities are anticipated to be strongest in technology, renewable energy, and healthcare [10]. 26 | 27 | # Key Players & Differentiation 28 | 29 | The generative AI market is characterized by a crowded vendor landscape, broadly categorized into foundation model providers, infrastructure providers, and application developers [11]. Differentiation among foundation model providers is achieved through performance, safety focus, open-source strategy, and integration capabilities [8, 35]. 
30 | 31 | The following table compares six leading foundation model developers based on their core offerings and strategic focus: 32 | 33 | | Player | Key Models | Distinctive Capabilities | 34 | | :--- | :--- | :--- | 35 | | **OpenAI** | GPT-4.1, GPT-4 Turbo | Industry-leading performance in English contexts; optimized and scalable integration via Microsoft Azure [1, 25]. | 36 | | **Google** | Gemini Pro, Gemma 2 | Comprehensive AI platform (Vertex AI); strong multimodal support and open-source model releases [25, 35]. | 37 | | **Anthropic** | Claude-3 Series | Focus on safe and ethical AI development, utilizing Constitutional AI principles [12]. | 38 | | **Meta** | Llama 3.1 | Leading provider of high-performing, open-source foundation models for community adoption [13]. | 39 | | **Mistral AI** | Mistral Large 2 | Focus on highly efficient, performant, and open models, often optimized for European markets [3, 35]. | 40 | | **Baidu** | Ernie-Bot 4 | Recognized as the leading LLM provider specifically tailored for Chinese language contexts [14]. | 41 | 42 | ## Objective Benchmarks and Metrics 43 | 44 | Differentiation is measured across several objective benchmarks, moving beyond simple performance scores to include safety, efficiency, and enterprise readiness: 45 | 46 | 1. **Contextual Performance Leadership:** Performance varies significantly across languages and tasks. GPT 4-Turbo is consistently identified as being at the forefront in English contexts, while Baidu’s Ernie-Bot 4 leads the field in Chinese contexts [14]. 47 | 2. **Technical Differentiation Metrics:** Models are rigorously tracked and benchmarked across numerous technical specifications. Key metrics used for comparison include Safety Rank, Jailbreaking Resistance, and specific task performance benchmarks such as GPQA and Math LiveBench [15]. 48 | 3. **Enterprise and API Capabilities:** Providers differentiate their offerings based on features critical for enterprise adoption. These include structured output, tool use, vision/audio understanding, prompt caching, and fine-tuning capabilities, often delivered through integrated cloud platforms like Google's Vertex AI or Microsoft Azure [23, 25]. 49 | 50 | # Risks & Open Questions 51 | 52 | ## Knowledge Gaps and Unforeseen Threats 53 | 54 | * **Unknown Unknowns:** Risks that are unforeseen and unanticipated because they fall outside the realm of current knowledge or past experience, often arising from complex interdependencies [16]. These are particularly dangerous in cybersecurity, where assets are unknown and thus impossible to secure [17]. 55 | * **Internal Blind Spots:** The most significant risks often stem from knowledge gaps that organizations are unaware they possess. Many so-called unknown unknowns were actually knowable if key information held by project personnel had been communicated to top decision-makers [18]. 56 | * **Data Analysis Failure:** Most companies analyze only 12% of collected data, leaving 88% of competitive threats and opportunities unnoticed, which can lead to significant losses and competitive blind spots [19]. 57 | * **Competitive Displacement:** Established firms risk being displaced by smaller rivals due to internal routinism, outdated R&D opinions, and limited perception focused only on themselves, failing to conduct fact-based assessments of competitor threats [23, 24]. 
58 | 59 | ## Regulatory and Legal Friction 60 | 61 | * **Challenged Regulatory Authority:** The Supreme Court's decision to overrule the *Chevron* deference framework opens the door to new legal challenges against federal regulations, potentially having enduring effects on agencies' ability to implement policies [20]. 62 | * **Outdated Frameworks:** Developments in regenerative medicine creating 'hybrid therapies' (combining medicines, devices, and human cells) pose significant challenges to regulatory frameworks that traditionally regulated these components separately [21]. 63 | * **Global Hurdles:** Global regulatory hurdles for medical devices pose risks to innovation and market access [22]. Furthermore, regulatory challenges have been cited as the primary reason why certain payment innovation initiatives, such as Open Banking, have not fully met expectations [23]. 64 | * **Unregulated Services:** The traditional regulatory system for the legal profession is ineffective at fostering necessary collaboration with nonlaw professionals, especially as nonlawyers provide services online without regulation [24]. 65 | * **Regulatory Costs:** Optimal regulatory rule-making is complicated by the fact that firms can challenge regulations in court, forcing regulators to incur costs to preclude challenges or accept legal fees [25]. 66 | 67 | ## Technological Scaling and Systemic Failure 68 | 69 | * **AI Compute Bottlenecks:** Scaling computational resources for AI model training faces limitations over the next decade due to bottlenecks in the complex, internationally distributed supply chain for semiconductors [26]. Constraints in human capital and power are projected to limit cumulative AI chip manufacturing capacity, potentially reducing the likelihood of some near-term transformative AI scenarios [26]. 70 | * **Agentic Failure Modes:** Agentic AI systems are subject to various failure modes, including specific threats like a 'Memory poisoning attack' on an AI email assistant, which require analysis of their effects and potential mitigations [27]. 71 | * **Inevitable Failure:** Failure Mode Analysis (FMA) is based on the key tenet that failures are inevitable, even with multiple layers of resiliency, particularly in complex environments [28]. 72 | 73 | ## Ethical and Unintended Consequences 74 | 75 | * **Systemic Intervention Risks:** Interventions like Nature-based Solutions (NbS) carry systemic risks, including ecological side effects, socio-political inequities, and epistemological overconfidence, challenging the assumption that such interventions are inherently beneficial [29]. 76 | * **Sustainable Development Backlash:** Sustainable development initiatives often lead to unintended consequences within social-ecological systems, such as those related to conservation efforts or alternative livelihood programs [30]. 77 | * **Regulatory Side Effects:** Measures intended to combat money laundering and terrorist financing (AML/CFT) can result in unintended consequences, including derisking, financial exclusion, and the suppression of Non-Profit Organizations (NPOs) and human rights [31]. 78 | * **AI Ethical Challenges:** Emerging technologies, especially AI, are not ethically neutral [32]. Ethical risks associated with Artificial Intelligence in Education (AIED) are categorized across technology (e.g., algorithmic bias, privacy invasion), education (e.g., homogenized development), and society (e.g., digital divide, lack of accountability) [33]. 
79 | * **Dynamic Ethical Factors:** Business leaders must be aware of dynamic ethical risk factors, which are often driven by global geopolitical and economic events (e.g., inflation, cost of living crises) that pose reputational, operational, and financial risks [34]. 80 | 81 | # Outlook (12–24 months) 82 | 83 | The 12-to-24-month outlook suggests a period of decelerating US growth, stabilizing inflation in developed markets, continued geopolitical complexity, and accelerated technological competition in key sectors. 84 | 85 | ### 1. US Monetary Easing and Decelerated Growth (2025–2026) 86 | 87 | The US economy is forecasted to experience decelerated growth, with real GDP projected at 0.9% in 2025, maintaining a 30% to 50% probability of recession [35]. This environment will prompt the Federal Reserve to implement further rate cuts. The median forecast projects the federal funds target range to reach 3.926% by the end of 2025 and 3.625% by the end of 2026 [2, 21]. Core PCE inflation is expected to end 2025 at 3.1% year-over-year [35]. 88 | 89 | * **Leading Indicators to Track:** Real GDP growth figures, Core PCE inflation rate, FOMC dot plot updates, and the unemployment rate (expected to stabilize around 4.4% in 2026) [2, 20]. 90 | 91 | ### 2. European Economic Recovery and Inflation Stabilization (2025–2026) 92 | 93 | The Euro area economy is expected to strengthen, with real GDP growth accelerating to 1.2% in 2025 and 1.4% in 2026 [36]. This recovery is anticipated to pull the Austrian economy out of its post-WWII recession by mid-2025 [37]. Headline HICP inflation in the Euro area is expected to decline and stabilize near 1.9% throughout both 2025 and 2026 [36]. 94 | 95 | * **Leading Indicators to Track:** Euro area HICP inflation, Euro area real GDP growth, and unemployment rates (expected to average 6.5% in 2025) [36]. 96 | 97 | ### 3. Equity Market Highs and Dollar Weakness (2025) 98 | 99 | US equity markets are projected to continue their upward trajectory, with the S&P 500 expected to close near 6,000 by year-end 2025, supported by double-digit earnings growth [38]. Concurrently, the US Dollar is expected to weaken against Emerging Market (EM) currencies as US economic exceptionalism fades [38]. Ongoing trade policy shifts are anticipated to cause a broad-based downshift in global growth while shifting inflation pressures toward the U.S. [38]. 100 | 101 | * **Leading Indicators to Track:** S&P 500 earnings growth reports, DXY index performance, and EM currency performance [38]. 102 | 103 | ### 4. Intensified Geopolitical and Regulatory Complexity (2025–2026) 104 | 105 | Overall uncertainty, driven by macroeconomic and geopolitical disruptions, is expected to continue or deepen through 2026 [6, 3]. Geopolitical complexity will be heightened by the global elections supercycle and new areas of competition, particularly Artificial Intelligence (AI) and the oceans [39]. Businesses must navigate diverging AI policies and potential sweeping US tariff proposals, which risk raising CPI inflation by over 1 percentage point [15, 17]. Strategic realignment and supply chain adjustments will be necessary to manage compliance risks and reliance on critical mineral supply chains [15, 17]. 106 | 107 | * **Leading Indicators to Track:** Implementation status of new US tariff proposals, regulatory divergence indices concerning AI, and critical mineral supply chain stability metrics [15, 17]. 108 | 109 | ### 5. 
Accelerated Growth in Next-Generation Biotherapeutics (2025–2026) 110 | 111 | The Global Biotechnology Market, valued at $1.8 trillion in 2024, is expected to sustain strong growth (13.543% CAGR through 2034) [40]. The Next Generation Biotherapeutics segment, specifically, is projected to grow at a 12.05% CAGR (2026–2030), fueled by technological advancements in platforms like CRISPR gene editing, mRNA delivery, and synthetic biology [41]. Global regulatory bodies are expected to continue expediting market access for these innovative treatments, shortening time-to-market [41]. 112 | 113 | * **Leading Indicators to Track:** Number of new Breakthrough Therapy Designations granted by regulatory bodies, R&D investment in synthetic biology platforms, and revenue growth of key biotechnology firms [41]. 114 | 115 | ## References 116 | [1] McKinsey technology trends outlook 2024 — mckinsey.com (published 2024-07-16). https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-top-trends-in-tech?utm_source=pocket_shared 117 | [2] The next big arenas of competition — mckinsey.com (published 2024-10-01). https://www.mckinsey.com/~/media/mckinsey/mckinsey%20global%20institute/our%20research/the%20next%20big%20arenas%20of%20competition/the-next-big-arenas-of-competition_final.pdf 118 | [3] Investing in the infrastructure of modern society — mckinsey.com (published 2025-09-09). https://www.mckinsey.com/industries/infrastructure/our-insights/the-infrastructure-moment 119 | [4] Climate Change 2023 Synthesis Report — ipcc.ch. https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf 120 | [5] Technology Growth Statistics 2025: Market Size, AI, and Innovation — sqmagazine.co.uk (published 2025-05-14). https://sqmagazine.co.uk/technology-growth-statistics/ 121 | [6] The state of AI in 2023: Generative AI’s breakout year — mckinsey.com (published 2023-08-01). https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai-in-2023-generative-ais-breakout-year?utm_source=substack&utm_medium=email 122 | [7] Gartner Forecasts Worldwide IT Spending to Grow 8% in 2024 — gartner.com (published 2023-10-18). https://www.gartner.com/en/newsroom/press-releases/2023-10-18-gartner-forecasts-worldwide-it-spending-to-grow-8-percent-in-2024 123 | [8] AI Accelerators Market Research 2025-2029 — globenewswire.com (published 2025-08-15). https://www.globenewswire.com/news-release/2025/08/15/3134075/28124/en/AI-Accelerators-Market-Research-2025-2029-Market-to-Expand-at-44-1-CAGR-Driven-by-Sovereign-AI-and-Geopolitical-Factors.html 124 | [9] Gartner Forecasts Worldwide IT Spending to Grow 9.3% in 2025 — gartner.com (published 2024-10-23). https://www.gartner.com/en/newsroom/press-releases/2024-10-23-gartner-forecasts-worldwide-it-spending-to-grow-nine-point-three-percent-in-2025 125 | [10] Industry outlook 2025 — eiu.com (published 2024-10-24). https://www.eiu.com/n/campaigns/industry-outlook-2025/ 126 | [11] Navigating the generative AI vendor landscape — transformainsights.com (published 2024-01-30). https://transformainsights.com/research/reports/navigating-generative-ai-vendor-landscape 127 | [12] LLM Price Comparison | Compare AI Model Costs — llmpricecomparison.com (published 2025-01-01). https://llmpricecomparison.com/providers 128 | [13] The Vanguard of Open-Source LLMs: A Comprehensive Analysis (2024–2025) — medium.com (published 2025-08-04). 
https://medium.com/@haiderkhan6410/the-vanguard-of-open-source-llms-a-comprehensive-analysis-2024-2025-a5805592fe8f 129 | [14] Unveiling the Competitive Dynamics: A Comparative Evaluation of American and Chinese LLMs — papers.ssrn.com (published 2024-05-10). https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4823501 130 | [15] LLM Decision Hub - AI Model Rankings & Benchmarks — llmleaderboard.ai (published 2025-03-01). https://llmleaderboard.ai/leaderboard/ 131 | [16] Navigating the “Unknown Unknowns” Risks — linkedin.com (published 2024-10-23). https://www.linkedin.com/pulse/navigating-unknown-unknowns-risks-st-mmt-ipm-pmp-pmi-rmp-uustc 132 | [17] What are Unknown Unknowns | Cybersecurity Glossary — cycognito.com (published 2025-01-01). https://www.cycognito.com/glossary/unknown-unknowns.php 133 | [18] Looking in the right places to identify “unknown unknowns ... — i2insights.org (published 2019-11-05). https://i2insights.org/2019/11/05/detecting-unknown-unknowns-in-projects/ 134 | [19] Blind Spot: Uncovering Hidden Competitive Threats — octopusintelligence.com (published 2025-09-19). https://www.octopusintelligence.com/the-2-3-million-blind-spot-why-73-of-companies-miss-their-biggest-competitive-threats-until-its-too-late/ 135 | [20] Supreme Court Opens Door to New Legal Challenges ... — mcguirewoods.com (published 2024-07-03). https://www.mcguirewoods.com/client-resources/alerts/2024/7/supreme-court-opens-door-to-new-legal-challenges-to-federal-regulations-new-and-old/ 136 | [21] Legal and Regulatory Challenges for Emerging Regenerative... : Transplantation — journals.lww.com (published 2025-05-13). https://journals.lww.com/transplantjournal/fulltext/2024/05000/legal_and_regulatory_challenges_for_emerging.15.aspx 137 | [22] Global Regulatory Challenges for Medical Devices: Impact on Innovation and Market Access — mdpi.com (published 2024-10-12). https://www.mdpi.com/2076-3417/14/20/9304 138 | [23] UK Payment Innovations: Regulatory Hurdles in Open Banking — edgardunn.com (published 2024-11-01). https://www.edgardunn.com/articles/how-much-regulation-is-needed 139 | [24] Regulatory Hurdles--21st Century Law — michbar.org (published 2021-01-01). https://www.michbar.org/future/regulation 140 | [25] Regulatory Rule-Making with Legal Challenges — sciencedirect.com (published 2000-08-10). https://www.sciencedirect.com/science/article/pii/S0095069699911087 141 | [26] Scaling Limits to AI Chip Manufacturing — openreview.net (published 2025-05-13). https://openreview.net/pdf?id=jwhrNLkEuk 142 | [27] Taxonomy of Failure Mode in Agentic AI Systems — cdn-dynmedia-1.microsoft.com (published 2025-04-16). https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/microsoft/final/en-us/microsoft-brand/documents/Taxonomy-of-Failure-Mode-in-Agentic-AI-Systems-Whitepaper.pdf 143 | [28] Architecture strategies for performing failure mode analysis — learn.microsoft.com (published 2024-10-08). https://learn.microsoft.com/en-us/azure/well-architected/reliability/failure-mode-analysis 144 | [29] Rethinking Nature-Based Solutions: Unintended Consequences, Ancient Wisdom, and the Limits of Nature — mdpi.com (published 2025-06-13). https://www.mdpi.com/2073-445X/14/6/1272 145 | [30] Unintended consequences of sustainable development ... — ecologyandsociety.org (published 2022-05-26). https://ecologyandsociety.org/vol27/iss2/art10/ 146 | [31] Mitigating Unintended Consequences — fatf-gafi.org (published 2024-07-31). 
https://www.fatf-gafi.org/en/publications/Financialinclusionandnpoissues/Unintended-consequences-project.html 147 | [32] Ethical Considerations and Challenges of AI Adoption in Project Management — link.springer.com (published 2025-03-18). https://link.springer.com/chapter/10.1007/978-3-031-56310-2_16?error=cookies_not_supported&code=c39f2730-ee66-4e00-ac89-8e3a3ce785f6 148 | [33] a systematic review on identifying and mitigating ethical risks — nature.com (published 2025-07-16). https://www.nature.com/articles/s41599-025-05252-6 149 | [34] What are the ethical risk factors business leaders are most ... — ibe.org.uk (published 2024-02-19). https://www.ibe.org.uk/resource/what-are-the-ethical-risk-factors-business-leaders-are-most-concerned-about-in-2024.html 150 | [35] SIFMA 2025 Mid-Year Economic Survey — sifma.org (published 2025-07-11). https://www.sifma.org/wp-content/uploads/2024/12/1H25-Economic-Survey-Report_2025-07-11.pdf 151 | [36] The ECB Survey of Professional Forecasters - Fourth quarter of 2024 — ecb.europa.eu (published 2024-10-18). https://www.ecb.europa.eu/stats/ecb_surveys/survey_of_professional_forecasters/html/ecb.spf2024q4~ee6e2cd847.en.html 152 | [37] Austria is in its Third Year of Recession. Economic Outlook for 2025 and 2026 — wifo.ac.at (published 2025-05-16). https://www.wifo.ac.at/wp-content/uploads/upload-8802/roa_2025_05_economic_outlook.pdf 153 | [38] Mid-year market outlook 2025 | J.P. Morgan Research — jpmorgan.com (published 2025-07-01). https://www.jpmorgan.com/insights/global-research/outlook/mid-year-outlook 154 | [39] Top 10 geopolitical risks for 2024 — ey.com (published 2025-06-11). https://www.ey.com/en_gl/insights/geostrategy/2024-geostrategic-outlook 155 | [40] Biotechnology Market Outlook Report: Industry Size, Market Shares Data, Latest Trends, Insights, Growth Potential, CAGR Forecasts to 2034 — researchandmarkets.com (published 2025-01-01). https://www.researchandmarkets.com/reports/6027439/biotechnology-market-outlook-report-industry?srsltid=AfmBOoriaKomZjCF_-4pG3kRYUgNfiEPX3VKmlGYgtKGK_qZKNubnpLf 156 | [41] Next Generation Biotherapeutics Market to Grow with a CAGR of 12.05% through 2030 — techsciresearch.com (published 2025-01-01). 
https://techsciresearch.com/news/6646-next-generation-biotherapeutics-market.html
--------------------------------------------------------------------------------
/dspy-gepa-researcher/dspy_gepa_researcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import os
4 | import re
5 | import json
6 | import asyncio
7 | import operator
8 | from typing import Any, Dict, List, Optional, Tuple, Annotated
9 | from urllib.parse import urlparse
10 | from pydantic import BaseModel, Field
11 | from dateutil import parser as dtparse
12 | 
13 | from typing_extensions import TypedDict
14 | from langgraph.graph import StateGraph, START, END
15 | from langgraph.types import Send
16 | 
17 | import dspy
18 | from dspy.teleprompt import GEPA
19 | 
20 | from exa_py import Exa
21 | 
22 | # ----------------------------
23 | # Configuration
24 | # ----------------------------
25 | 
26 | MAX_ROUNDS = int(os.environ.get("RR_MAX_ROUNDS", "1"))  # writer<->research loop rounds
27 | SEARCH_RESULTS_PER_QUERY = int(os.environ.get("RR_SEARCH_K", "6"))  # per query
28 | MAX_CONTENT_CHARS_PER_SOURCE = int(os.environ.get("RR_MAX_CHARS", "12000"))
29 | 
30 | WRITER_MODEL = os.environ.get("GEMINI_WRITER_MODEL", "gemini/gemini-flash-latest")
31 | RESEARCH_MODEL = os.environ.get("GEMINI_RESEARCH_MODEL", "gemini/gemini-flash-latest")
32 | REFLECTION_MODEL = os.environ.get("GEMINI_REFLECTION_MODEL", WRITER_MODEL)
33 | 
34 | FALLBACK_WRITER = "gemini/gemini-flash-latest"
35 | FALLBACK_RESEARCH = "gemini/gemini-flash-latest"
36 | 
37 | GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
38 | EXA_API_KEY = os.environ.get("EXA_API_KEY")
39 | 
40 | if not GEMINI_API_KEY:
41 |     raise RuntimeError("GEMINI_API_KEY is not set.")
42 | if not EXA_API_KEY:
43 |     raise RuntimeError("EXA_API_KEY is not set. 
Get one from dashboard.exa.ai") 44 | 45 | # Initialize Exa once (thread-safe to call via to_thread) 46 | EXA = Exa(EXA_API_KEY) 47 | 48 | # ---------------------------- 49 | # Utilities & data models 50 | # ---------------------------- 51 | 52 | def short_host(u: str) -> str: 53 | try: 54 | return urlparse(u).netloc.replace("www.", "") 55 | except Exception: 56 | return u 57 | 58 | def clamp(n: float, lo=0.0, hi=1.0) -> float: 59 | return max(lo, min(hi, n)) 60 | 61 | def safe_json_loads(s: str, fallback=None): 62 | try: 63 | return json.loads(s) 64 | except Exception: 65 | return fallback 66 | 67 | class SectionSpec(BaseModel): 68 | name: str 69 | instructions: str 70 | 71 | class SourceDoc(BaseModel): 72 | url: str 73 | title: Optional[str] = None 74 | site: Optional[str] = None 75 | published: Optional[str] = None 76 | content: Optional[str] = None 77 | 78 | class ResearchSummary(BaseModel): 79 | section: str 80 | query: str 81 | bullets: List[str] = Field(default_factory=list) 82 | sources: List[SourceDoc] = Field(default_factory=list) 83 | 84 | class ReviewReport(BaseModel): 85 | pass_checks: bool 86 | summary: str 87 | issues: List[str] = Field(default_factory=list) 88 | suggestions: List[str] = Field(default_factory=list) 89 | 90 | class EvalResult(BaseModel): 91 | score: float 92 | breakdown: Dict[str, float] 93 | notes: str 94 | 95 | # ---------------------------- 96 | # DSPy model setup 97 | # ---------------------------- 98 | 99 | def _make_lm(model_name: str, api_key: str, temperature: float = 0.3, model_type: str = "chat", max_tokens: int = 65536): 100 | """Create a DSPy LM via LiteLLM provider strings (e.g., 'gemini/gemini-2.5-pro-preview-03-25').""" 101 | try: 102 | return dspy.LM(model_name, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 103 | except Exception: 104 | if "pro" in model_name: 105 | return dspy.LM(FALLBACK_WRITER, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 106 | return dspy.LM(FALLBACK_RESEARCH, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 107 | 108 | WRITER_LM = _make_lm(WRITER_MODEL, GEMINI_API_KEY, temperature=0.2) 109 | RESEARCH_LM = _make_lm(RESEARCH_MODEL, GEMINI_API_KEY, temperature=0.4) 110 | REFLECT_LM = _make_lm(REFLECTION_MODEL, GEMINI_API_KEY, temperature=0.8) 111 | 112 | dspy.configure(lm=RESEARCH_LM, cache=True) 113 | 114 | # ---------------------------- 115 | # DSPy Signatures (instructions) 116 | # ---------------------------- 117 | 118 | class QueryGenSig(dspy.Signature): 119 | """Produce 4–8 diverse Exa search queries for a section (use quoted phrases, site:, intitle:, date ranges). Return a JSON list of strings.""" 120 | section_title = dspy.InputField() 121 | section_instructions = dspy.InputField() 122 | queries_json = dspy.OutputField() 123 | 124 | class SummarizeSig(dspy.Signature): 125 | """Summarize source texts into evidence bullets for the section. 126 | OUTPUT JSON: {"bullets": ["...", "..."]}. Cite as [S#] (matching the per-query ordering). Keep bullets concise & factual. 127 | """ 128 | prompt = dspy.InputField() 129 | sources_digest = dspy.InputField() 130 | output_json = dspy.OutputField() 131 | 132 | class WriteSectionSig(dspy.Signature): 133 | """Write a polished Markdown section '# {section_title}' using [n] numeric citations only. Avoid bare URLs. 
Return ONLY the section Markdown.""" 134 | section_title = dspy.InputField() 135 | section_instructions = dspy.InputField() 136 | evidence_digest = dspy.InputField() 137 | output_markdown = dspy.OutputField() 138 | 139 | class GapAnalysisSig(dspy.Signature): 140 | """Given current bullets, decide if more research is needed. OUTPUT JSON: {"need_more": bool, "followup_queries": ["..."]}""" 141 | section_title = dspy.InputField() 142 | bullets_digest = dspy.InputField() 143 | output_json = dspy.OutputField() 144 | 145 | class CiteFixSig(dspy.Signature): 146 | """Fix citations: ensure only [n] numeric citations (no [S#] or raw URLs). Return ONLY the corrected Markdown body.""" 147 | markdown_body = dspy.InputField() 148 | id_map_notes = dspy.InputField() 149 | fixed_markdown = dspy.OutputField() 150 | 151 | class ReviewSig(dspy.Signature): 152 | """Review the full report for coverage, correctness, clarity, neutrality, structure, citation hygiene. OUTPUT JSON: {pass_checks, issues, suggestions, summary}""" 153 | report_md = dspy.InputField() 154 | output_json = dspy.OutputField() 155 | 156 | class ReviseSig(dspy.Signature): 157 | """Apply review suggestions to the report without adding new unsupported facts. Return the improved Markdown body (no References).""" 158 | report_md = dspy.InputField() 159 | suggestions = dspy.InputField() 160 | improved_md = dspy.OutputField() 161 | 162 | QUERY_GEN = dspy.ChainOfThought(QueryGenSig) 163 | SUMMARIZER = dspy.Predict(SummarizeSig) 164 | WRITE_SECTION = dspy.ChainOfThought(WriteSectionSig) 165 | GAP_ANALYZER = dspy.Predict(GapAnalysisSig) 166 | CITE_FIXER = dspy.Predict(CiteFixSig) 167 | REVIEWER = dspy.Predict(ReviewSig) 168 | REVISER = dspy.ChainOfThought(ReviseSig) 169 | 170 | # ---------------------------- 171 | # GEPA: fast optimization 172 | # ---------------------------- 173 | 174 | def heuristic_report_metric(gold, pred, trace=None, pred_name=None, pred_trace=None) -> float: 175 | """LLM-free shaping signal for GEPA.""" 176 | text = "" 177 | if hasattr(pred, "output_markdown"): text = pred.output_markdown or "" 178 | elif hasattr(pred, "fixed_markdown"): text = pred.fixed_markdown or "" 179 | elif hasattr(pred, "queries_json"): text = pred.queries_json or "" 180 | elif hasattr(pred, "output_json"): text = pred.output_json or "" 181 | score, notes = 0.0, [] 182 | if hasattr(pred, "queries_json"): 183 | data = safe_json_loads(text, []) 184 | uniq = len(set([q.strip().lower() for q in data if isinstance(q, str)])) 185 | has_ops = any(("site:" in (q or "").lower() or "intitle:" in (q or "").lower() or '"' in (q or "")) for q in data if isinstance(q, str)) 186 | score = 0.3*clamp(uniq/8) + 0.2*(1 if 4 <= uniq <= 10 else 0) + 0.5*(1 if has_ops else 0) 187 | if uniq < 4: notes.append("Add 6–8 diverse queries.") 188 | if not has_ops: notes.append("Use operators like site:, intitle:, \"quoted\".") 189 | elif hasattr(pred, "output_markdown"): 190 | has_h1 = bool(re.search(r"^#\s+", text, flags=re.M)) 191 | cites = len(re.findall(r"\[\d+\]", text)) 192 | urls_inline = bool(re.search(r"https?://", text)) 193 | too_short = len(text) < 700 194 | score = (0.25*(1 if has_h1 else 0) + 0.35*clamp(cites/6) + 0.15*(0 if urls_inline else 1) + 0.25*(0 if too_short else 1)) 195 | if not has_h1: notes.append("Start with H1.") 196 | if cites < 3: notes.append("Add more [n] citations.") 197 | if urls_inline: notes.append("No bare URLs in body.") 198 | if too_short: notes.append("Increase depth (>=700 chars).") 199 | elif hasattr(pred, "fixed_markdown"): 200 | leftovers = 
bool(re.search(r"\[S\d+\]", text)) 201 | bracket_nums = bool(re.search(r"\[\d+\]", text)) 202 | score = 0.6*(1 if bracket_nums else 0) + 0.4*(0 if leftovers else 1) 203 | if leftovers: notes.append("Replace [S#] with [n].") 204 | elif hasattr(pred, "output_json"): 205 | ok = safe_json_loads(text) is not None 206 | score = 1.0 if ok else 0.2 207 | if not ok: notes.append("Return valid JSON.") 208 | else: 209 | score = 0.5; notes.append("Improve structure.") 210 | return float(clamp(score, 0, 1)) 211 | 212 | def optimize_with_gepa() -> None: 213 | """Optimize each DSPy module with its specific training set.""" 214 | # Map each module to its training set function 215 | module_trainsets = { 216 | QUERY_GEN: trainset_query_gen(), 217 | SUMMARIZER: trainset_summarizer(), 218 | WRITE_SECTION: trainset_write_section(), 219 | GAP_ANALYZER: trainset_gap_analyzer(), 220 | CITE_FIXER: trainset_cite_fixer(), 221 | REVIEWER: trainset_reviewer(), 222 | REVISER: trainset_reviser(), 223 | } 224 | 225 | tele = GEPA(metric=heuristic_report_metric, auto="light", reflection_lm=REFLECT_LM, track_stats=False) 226 | 227 | for module, trainset in module_trainsets.items(): 228 | module_name = type(module).__name__ 229 | print(f"[GEPA] Optimizing {module_name} with {len(trainset)} examples...") 230 | try: 231 | tele.compile(student=module, trainset=trainset) 232 | print(f"[GEPA] ✓ {module_name} optimization complete") 233 | except Exception as e: 234 | print(f"[GEPA] ✗ Skipped {module_name}: {e}") 235 | 236 | # Module-specific training sets for GEPA 237 | 238 | def trainset_query_gen() -> List[dspy.Example]: 239 | """Training examples for QUERY_GEN: diverse search queries with operators.""" 240 | return [ 241 | dspy.Example( 242 | section_title="Market Size", 243 | section_instructions="TAM, SAM, SOM 2019–2025 with figures and growth rates", 244 | queries_json='["market size TAM 2024", "site:.gov industry market size", "intitle:forecast 2024..2025", "\\"total addressable market\\" 2024", "SAM SOM market sizing report", "site:statista.com market size trends"]' 245 | ).with_inputs('section_title', 'section_instructions'), 246 | dspy.Example( 247 | section_title="Technology Stack", 248 | section_instructions="Current tools, frameworks, and infrastructure with version numbers", 249 | queries_json='["site:github.com popular frameworks 2024", "intitle:\\"tech stack\\" comparison", "\\"technology adoption\\" trends 2024", "infrastructure tools benchmarks", "site:stackoverflow.com framework usage statistics", "developer survey 2024 tools"]' 250 | ).with_inputs('section_title', 'section_instructions'), 251 | dspy.Example( 252 | section_title="Regulatory Landscape", 253 | section_instructions="Key regulations, compliance requirements, and policy changes 2023-2025", 254 | queries_json='["site:.gov regulations 2024", "compliance requirements industry", "intitle:\\"policy changes\\" 2024..2025", "\\"regulatory framework\\" updates", "site:.org legal compliance guidelines", "data protection regulations 2024"]' 255 | ).with_inputs('section_title', 'section_instructions'), 256 | ] 257 | 258 | def trainset_summarizer() -> List[dspy.Example]: 259 | """Training examples for SUMMARIZER: source texts to cited bullets.""" 260 | return [ 261 | dspy.Example( 262 | prompt="Summarize for section 'Market Growth'. 
Cite using [S#] matching the source indices above.", 263 | sources_digest="S1 | Market Report 2024 — example.com\nThe global market grew 15% YoY in Q3 2024, reaching $500B valuation.\n\nS2 | Industry Analysis — research.org\nAdoption rates increased from 23% in 2023 to 34% in 2024, driven by enterprise demand.", 264 | output_json='{"bullets": ["Global market grew 15% YoY in Q3 2024, reaching $500B valuation [S1]", "Adoption rates increased from 23% (2023) to 34% (2024), driven by enterprise demand [S2]"]}' 265 | ).with_inputs('prompt', 'sources_digest'), 266 | dspy.Example( 267 | prompt="Summarize for section 'Key Risks'. Cite using [S#] matching the source indices above.", 268 | sources_digest="S1 | Risk Assessment Report — analyst.com\nSupply chain disruptions pose the highest risk, with 67% of companies reporting delays.\n\nS2 | Security Analysis — security.org\nCybersecurity incidents increased 45% in H1 2024 compared to 2023.", 269 | output_json='{"bullets": ["Supply chain disruptions pose highest risk, with 67% of companies reporting delays [S1]", "Cybersecurity incidents increased 45% in H1 2024 vs 2023 [S2]"]}' 270 | ).with_inputs('prompt', 'sources_digest'), 271 | ] 272 | 273 | def trainset_write_section() -> List[dspy.Example]: 274 | """Training examples for WRITE_SECTION: evidence to markdown with citations.""" 275 | return [ 276 | dspy.Example( 277 | section_title="Market Trends", 278 | section_instructions="2024-2025 trends with specific data points and citations", 279 | evidence_digest="Evidence for 'Market Trends':\n- Cloud adoption growing 25% annually [1]\n- AI integration became standard in 78% of new products [2]\n- Mobile-first approach now 89% of development strategies [3]", 280 | output_markdown='# Market Trends\n\nThe market has experienced significant transformation in 2024-2025. Cloud adoption continues to accelerate at 25% annual growth [1], fundamentally changing infrastructure strategies across enterprises. Artificial intelligence has moved from experimental to essential, with 78% of new products now integrating AI capabilities as a standard feature [2].\n\nDevelopment approaches have also shifted dramatically toward mobile-first strategies, with 89% of teams now prioritizing mobile experiences in their product roadmaps [3]. This represents a maturation of mobile technology from an afterthought to the primary delivery platform for most digital services.' 281 | ).with_inputs('section_title', 'section_instructions', 'evidence_digest'), 282 | dspy.Example( 283 | section_title="Competitive Landscape", 284 | section_instructions="Top 5 players with market share and differentiators", 285 | evidence_digest="Evidence for 'Competitive Landscape':\n- Company A leads with 32% market share, strong in enterprise [1]\n- Company B holds 24%, focused on SMB segment [2]\n- Company C at 18%, competing on price and features [3]\n- Companies D and E combined hold 15%, niche players [4]", 286 | output_markdown='# Competitive Landscape\n\nThe market is led by Company A with a commanding 32% market share, leveraging strong enterprise relationships and established trust [1]. Company B follows with 24% market share, having carved out a successful position in the SMB segment through targeted product features and pricing [2].\n\nCompany C holds 18% of the market, competing aggressively on both price and feature completeness [3]. The remaining market is fragmented among smaller players, with Companies D and E combining for 15% share, each serving specific niche segments [4].' 
287 | ).with_inputs('section_title', 'section_instructions', 'evidence_digest'), 288 | ] 289 | 290 | def trainset_gap_analyzer() -> List[dspy.Example]: 291 | """Training examples for GAP_ANALYZER: decide if more research needed.""" 292 | return [ 293 | dspy.Example( 294 | section_title="Market Size", 295 | bullets_digest="- Market size $500B in 2024\n- Growing 15% YoY\n- North America 45% share\n- Asia Pacific 30% share\n- Europe 20% share\n- Forecast to reach $850B by 2027", 296 | output_json='{"need_more": false, "followup_queries": []}' 297 | ).with_inputs('section_title', 'bullets_digest'), 298 | dspy.Example( 299 | section_title="Technology Stack", 300 | bullets_digest="- React most popular framework\n- Python growing in backend", 301 | output_json='{"need_more": true, "followup_queries": ["infrastructure technologies 2024", "database trends and adoption", "cloud platform market share", "DevOps tool usage statistics"]}' 302 | ).with_inputs('section_title', 'bullets_digest'), 303 | dspy.Example( 304 | section_title="Risks", 305 | bullets_digest="No bullets yet.", 306 | output_json='{"need_more": true, "followup_queries": ["industry risks 2024", "compliance challenges", "supply chain vulnerabilities", "cybersecurity threats report", "market disruption factors"]}' 307 | ).with_inputs('section_title', 'bullets_digest'), 308 | ] 309 | 310 | def trainset_cite_fixer() -> List[dspy.Example]: 311 | """Training examples for CITE_FIXER: convert [S#] to [n].""" 312 | return [ 313 | dspy.Example( 314 | markdown_body="# Findings\n\nResearch shows significant growth [S1] with adoption increasing [S2]. The trend continues [S1] into 2025.", 315 | id_map_notes="1 -> https://example.com/report\n2 -> https://research.org/study", 316 | fixed_markdown="# Findings\n\nResearch shows significant growth [1] with adoption increasing [2]. The trend continues [1] into 2025." 317 | ).with_inputs('markdown_body', 'id_map_notes'), 318 | dspy.Example( 319 | markdown_body="# Market Analysis\n\nThe market leader [S3] holds 40% share, while challenger [S1] has 25%. Recent data [S2] confirms this distribution [S3].", 320 | id_map_notes="1 -> https://news.com/article\n2 -> https://data.org/stats\n3 -> https://market.com/report", 321 | fixed_markdown="# Market Analysis\n\nThe market leader [3] holds 40% share, while challenger [1] has 25%. Recent data [2] confirms this distribution [3]." 322 | ).with_inputs('markdown_body', 'id_map_notes'), 323 | ] 324 | 325 | def trainset_reviewer() -> List[dspy.Example]: 326 | """Training examples for REVIEWER: review reports for quality.""" 327 | return [ 328 | dspy.Example( 329 | report_md="# Executive Summary\n\nThis report analyzes market trends based on comprehensive research [1][2][3].\n\n# Market Size\n\nThe global market reached $500B in 2024, growing 15% YoY [1]. Regional distribution shows North America at 45%, Asia Pacific 30%, and Europe 20% [2].\n\n## References\n[1] Market Report 2024 — example.com. https://example.com/report\n[2] Regional Analysis — research.org. https://research.org/study\n[3] Industry Forecast — analyst.com. https://analyst.com/forecast", 330 | output_json='{"pass_checks": true, "issues": [], "suggestions": ["Consider adding specific date ranges for projections", "Could expand on Asia Pacific growth drivers"], "summary": "Well-structured report with good citation coverage and clear regional breakdown."}' 331 | ).with_inputs('report_md'), 332 | dspy.Example( 333 | report_md="# Market Overview\n\nThe market is growing fast. 
Many companies are entering the space. https://example.com shows good data.\n\n## References", 334 | output_json='{"pass_checks": false, "issues": ["No specific data or citations in body", "Bare URL in prose instead of numeric citation", "Empty references section", "Vague language without concrete facts"], "suggestions": ["Add specific growth percentages with citations", "Replace bare URL with [n] citation and add to references", "Include concrete company names and market shares", "Provide time-specific data points"], "summary": "Report lacks specificity, proper citations, and concrete data. Needs major revision."}' 335 | ).with_inputs('report_md'), 336 | ] 337 | 338 | def trainset_reviser() -> List[dspy.Example]: 339 | """Training examples for REVISER: improve reports based on feedback.""" 340 | return [ 341 | dspy.Example( 342 | report_md="# Market Size\n\nThe market is big and growing. Many companies participate.\n\n## References\n[1] Market Report — example.com. https://example.com", 343 | suggestions="- Add specific market size figures with citations\n- Include growth percentages and timeframes\n- Specify key player names", 344 | improved_md="# Market Size\n\nThe global market reached $500 billion in 2024, representing 15% year-over-year growth [1]. The market is characterized by strong participation from both established enterprises and emerging startups, with the top 5 players accounting for 68% of total market share [1]." 345 | ).with_inputs('report_md', 'suggestions'), 346 | dspy.Example( 347 | report_md="# Technology Trends\n\nAI is popular. Cloud computing is used a lot. https://tech.com has more info.\n\n## References\n[1] Tech Report — tech.com. https://tech.com", 348 | suggestions="- Replace bare URL with proper citation\n- Add specific adoption percentages\n- Include concrete technology names", 349 | improved_md="# Technology Trends\n\nArtificial intelligence adoption has reached 78% among enterprise software products in 2024 [1]. Cloud infrastructure utilization continues to grow, with 89% of organizations now operating hybrid or multi-cloud environments [1]. Key technologies driving this transformation include transformer-based language models, containerization platforms like Kubernetes, and serverless computing architectures [1]." 350 | ).with_inputs('report_md', 'suggestions'), 351 | ] 352 | 353 | # ---------------------------- 354 | # Exa search + contents 355 | # ---------------------------- 356 | 357 | async def exa_search_and_contents(query: str, k: int) -> List[SourceDoc]: 358 | """ 359 | Use Exa's search_and_contents to get top-k results with full text. 360 | Python SDK fields: result.url, result.title, result.published_date, result.text. (See SDK spec.) 
361 | """ 362 | def _call(): 363 | # You can add: category="news"/"research paper", start_published_date=..., type="auto" 364 | return EXA.search_and_contents(query, text=True, num_results=k) 365 | try: 366 | resp = await asyncio.to_thread(_call) 367 | except Exception as e: 368 | print(f"[Exa] search failed for '{query}': {e}") 369 | return [] 370 | docs: List[SourceDoc] = [] 371 | for r in getattr(resp, "results", []) or []: 372 | text = (getattr(r, "text", None) or "")[:MAX_CONTENT_CHARS_PER_SOURCE] 373 | if not text or len(text) < 200: 374 | continue 375 | url = getattr(r, "url", None) or getattr(r, "id", None) 376 | title = getattr(r, "title", None) 377 | pub = getattr(r, "published_date", None) 378 | # Normalize published to ISO date if possible 379 | if pub: 380 | try: pub = dtparse.parse(pub).date().isoformat() 381 | except Exception: pass 382 | docs.append(SourceDoc(url=url, title=title, site=short_host(url), published=pub, content=text)) 383 | return docs 384 | 385 | # ---------------------------- 386 | # Citation registry 387 | # ---------------------------- 388 | 389 | class CitationRegistry: 390 | def __init__(self): self.url_to_id: Dict[str, int] = {}; self.ordered: List[str] = [] 391 | def assign(self, url: str) -> int: 392 | if url not in self.url_to_id: 393 | self.url_to_id[url] = len(self.ordered) + 1 394 | self.ordered.append(url) 395 | return self.url_to_id[url] 396 | def references_markdown(self, url_to_doc: Dict[str, SourceDoc]) -> str: 397 | lines = ["## References"] 398 | for u in self.ordered: 399 | idx = self.url_to_id[u]; doc = url_to_doc.get(u) or SourceDoc(url=u) 400 | label = doc.title or u; site = f" — {doc.site}" if doc.site else "" 401 | dt = f" (published {doc.published})" if doc.published else "" 402 | lines.append(f"[{idx}] {label}{site}{dt}. 
{u}") 403 | return "\n".join(lines) 404 | 405 | # ---------------------------- 406 | # Graph state 407 | # ---------------------------- 408 | 409 | class GraphState(TypedDict): 410 | topic: str 411 | sections: List[SectionSpec] 412 | round: int 413 | queries: Annotated[List[Dict[str, str]], operator.add] # [{"section","query"}] 414 | research: Annotated[List[ResearchSummary], operator.add] # append 415 | drafts: Annotated[Dict[str, str], operator.or_] # {section: markdown} 416 | cite_maps: Annotated[Dict[str, Dict[int, str]], operator.or_] # {section: {local_num: url}} 417 | used_urls: Annotated[List[str], operator.add] # optional 418 | report_md: Optional[str] 419 | references_md: Optional[str] 420 | eval_result: Optional[EvalResult] 421 | 422 | # ---------------------------- 423 | # Nodes (agents) 424 | # ---------------------------- 425 | 426 | def plan_queries(state: GraphState) -> GraphState: 427 | print("\n" + "="*80) 428 | print(f"[QUERY] Planning search queries for {len(state['sections'])} sections...") 429 | print("="*80) 430 | 431 | new_queries: List[Dict[str, str]] = [] 432 | with dspy.context(lm=WRITER_LM): 433 | for sec in state["sections"]: 434 | q = QUERY_GEN(section_title=sec.name, section_instructions=sec.instructions) 435 | data = safe_json_loads(q.queries_json, []) 436 | uniq, seen = [], set() 437 | for s in data: 438 | if isinstance(s, str): 439 | s2 = s.strip() 440 | if s2 and s2.lower() not in seen: 441 | uniq.append(s2); seen.add(s2.lower()) 442 | if len(uniq) >= 8: break 443 | if not uniq: 444 | uniq = [f'{sec.name} overview', f'{sec.name} trends 2024..2025', f'"{sec.name}" case studies', f'intitle:{sec.name} report PDF', f'site:.gov {sec.name}', f'site:.org {sec.name}'] 445 | print(f"[QUERY] {sec.name}: {len(uniq)} queries generated") 446 | for u in uniq[:8]: 447 | new_queries.append({"section": sec.name, "query": u}) 448 | 449 | print(f"[QUERY] Total queries planned: {len(new_queries)}\n") 450 | return {"queries": new_queries} 451 | 452 | def route_queries(state: GraphState): 453 | return [Send("search_node", {"section": item["section"], "query": item["query"]}) 454 | for item in state.get("queries", [])] 455 | 456 | async def search_node(state: GraphState) -> GraphState: 457 | """Exa search + contents + summarize (Flash).""" 458 | section, query = state["section"], state["query"] 459 | print(f"[SEARCH] '{section}': {query[:60]}{'...' if len(query) > 60 else ''}") 460 | 461 | docs = await exa_search_and_contents(query, k=SEARCH_RESULTS_PER_QUERY) 462 | print(f"[SEARCH] → Found {len(docs)} documents") 463 | 464 | # Build digest for LLM summarization: S1,S2,... per query 465 | pieces = [] 466 | for i, d in enumerate(docs, start=1): 467 | excerpt = (d.content or "")[:2000] 468 | pieces.append(f"S{i} | {d.title or d.url} — {d.site or ''}\n{excerpt}") 469 | sources_digest = "\n\n".join(pieces) if pieces else "NO_SOURCES" 470 | prompt = f"Summarize for section '{section}'. Cite using [S#] matching the source indices above." 
471 | 
472 |     with dspy.context(lm=RESEARCH_LM):
473 |         out = SUMMARIZER(prompt=prompt, sources_digest=sources_digest)
474 |     js = safe_json_loads(out.output_json, {}) or {}
475 |     bullets = js.get("bullets", [])
476 | 
477 |     print(f"[SEARCH] → Extracted {len(bullets)} evidence bullets\n")
478 |     return {"research": [ResearchSummary(section=section, query=query, bullets=bullets, sources=docs)]}
479 | 
480 | def merge_and_gap_analyze(state: GraphState) -> GraphState:
481 |     print("\n" + "="*80)
482 |     print(f"[GAP] Analyzing research coverage (Round {state['round'] + 1}/{MAX_ROUNDS})...")
483 |     print("="*80)
484 | 
485 |     sec_to_bullets: Dict[str, List[str]] = {}
486 |     for rs in state.get("research", []):
487 |         sec_to_bullets.setdefault(rs.section, []).extend(rs.bullets or [])
488 | 
489 |     followups: List[Dict[str, str]] = []
490 |     with dspy.context(lm=WRITER_LM):
491 |         for sec in state["sections"]:
492 |             bullets = sec_to_bullets.get(sec.name, [])
493 |             digest = "\n".join(f"- {b}" for b in bullets[:50]) if bullets else "No bullets yet."
494 |             print(f"[GAP] '{sec.name}': {len(bullets)} bullets collected")
495 | 
496 |             ga = GAP_ANALYZER(section_title=sec.name, bullets_digest=digest)
497 |             j = safe_json_loads(ga.output_json, {}) or {}
498 |             if j.get("need_more") and isinstance(j.get("followup_queries"), list):
499 |                 new_queries = [q for q in j["followup_queries"][:5] if isinstance(q, str) and q.strip()]
500 |                 if new_queries:
501 |                     print(f"[GAP] → Needs {len(new_queries)} more queries")
502 |                     for q in new_queries:
503 |                         followups.append({"section": sec.name, "query": q.strip()})
504 | 
505 |     if followups and state["round"] + 1 < MAX_ROUNDS:
506 |         print(f"\n[GAP] DECISION: Continue research with {len(followups)} followup queries\n")
507 |         return {"queries": followups, "round": state["round"] + 1}
508 | 
509 |     print("\n[GAP] DECISION: Research complete, proceeding to writing\n")
510 |     return {"queries": []}  # nothing new scheduled; route_or_write proceeds to writing
511 | 
512 | def route_or_write(state: GraphState):
513 |     # "queries" accumulates across rounds (operator.add), so only dispatch pairs that have not been searched yet.
514 |     done = {(rs.section, rs.query) for rs in state.get("research", [])}
515 |     pending = [q for q in state.get("queries", []) if (q["section"], q["query"]) not in done]
516 |     if pending and state["round"] > 0:
517 |         return [Send("search_node", {"section": q["section"], "query": q["query"]}) for q in pending]  # more research
518 |     # else: write each section in parallel
519 |     return [Send("write_section_node", {"section": s.name, "sections": state["sections"], "research": state.get("research", [])})
520 |             for s in state["sections"]]
521 | 
522 | def _build_evidence_digest(section: str, research: List[ResearchSummary]) -> Tuple[str, Dict[int, str]]:
523 |     """
524 |     Merge bullets across queries and build a section-local citation map (number -> url) for the writer pass.
525 |     Each query's sources are tagged S1..Sk; bullets citing [S#] are rewritten to stable section-local numbers so the writer and CITE_FIXER can resolve them. 
526 | """ 527 | lines = [f"Evidence for '{section}':"] 528 | s_to_url_global: Dict[int, str] = {} 529 | next_num = 1 530 | for rs in research: 531 | if rs.section != section: continue 532 | # map local S# for this query block to absolute local numbers for the section 533 | local_map = {} 534 | for d in rs.sources: 535 | local_map[f"S{len(local_map)+1}"] = d.url 536 | for b in rs.bullets: 537 | bb = b 538 | for s_id, url in local_map.items(): 539 | # assign a stable number for this section for each url 540 | if url not in s_to_url_global.values(): 541 | s_to_url_global[next_num] = url; assigned = next_num; next_num += 1 542 | else: 543 | # find existing number for this url 544 | assigned = [k for k,v in s_to_url_global.items() if v == url][0] 545 | bb = re.sub(rf"\[{s_id}\]", f"[{assigned}]", bb) 546 | lines.append(f"- {bb}") 547 | return "\n".join(lines), s_to_url_global # evidence digest, local map num->url 548 | 549 | def write_section_node(state: GraphState) -> GraphState: 550 | section = state["section"] 551 | sec_spec = next((s for s in state["sections"] if s.name == section), None) 552 | if not sec_spec: return {} 553 | 554 | print(f"[WRITE] Drafting section: '{section}'") 555 | 556 | edigest, local_num_to_url = _build_evidence_digest(section, state.get("research", [])) 557 | 558 | with dspy.context(lm=WRITER_LM): 559 | w = WRITE_SECTION(section_title=sec_spec.name, section_instructions=sec_spec.instructions, evidence_digest=edigest) 560 | md = w.output_markdown or f"# {section}\n\n*(No content was generated.)*\n" 561 | 562 | with dspy.context(lm=WRITER_LM): 563 | fixed = CITE_FIXER(markdown_body=md, id_map_notes="\n".join(f"{k} -> {v}" for k, v in local_num_to_url.items())) 564 | md2 = fixed.fixed_markdown or md 565 | 566 | used_ids = sorted(set(int(x) for x in re.findall(r"\[(\d+)\]", md2))) 567 | urls = [local_num_to_url.get(i) for i in used_ids if i in local_num_to_url] 568 | 569 | char_count = len(md2) 570 | cite_count = len(used_ids) 571 | print(f"[WRITE] → {char_count} chars, {cite_count} citations\n") 572 | 573 | # store section draft, local citation map (for final global renumber), and used urls 574 | return { 575 | "drafts": {section: md2}, 576 | "cite_maps": {section: local_num_to_url}, 577 | "used_urls": [u for u in urls if u] 578 | } 579 | 580 | def assemble_and_review(state: GraphState) -> GraphState: 581 | print("\n" + "="*80) 582 | print("[REVIEW] Assembling and reviewing final report...") 583 | print("="*80) 584 | 585 | order = [s.name for s in state["sections"]] 586 | global_reg = CitationRegistry() 587 | url_to_doc: Dict[str, SourceDoc] = {} 588 | 589 | # Build a metadata table for all discovered sources 590 | for rs in state.get("research", []): 591 | for d in rs.sources: 592 | url_to_doc[d.url] = d 593 | 594 | print(f"[REVIEW] Assembling {len(order)} sections...") 595 | 596 | # Renumber citations globally in order of their first appearance across sections 597 | def renumber_section(md: str, map_local: Dict[int, str]) -> str: 598 | def _repl(m): 599 | old_num = int(m.group(1)) 600 | url = map_local.get(old_num) 601 | if not url: return m.group(0) 602 | new_num = global_reg.assign(url) 603 | return f"[{new_num}]" 604 | # only replace inside the body (no references exist yet) 605 | return re.sub(r"\[(\d+)\]", _repl, md) 606 | 607 | parts = [] 608 | for sec in order: 609 | body = state["drafts"].get(sec, "") 610 | local_map = state.get("cite_maps", {}).get(sec, {}) 611 | parts.append(renumber_section(body, local_map)) 612 | body_renumbered = 
"\n\n".join(parts).strip() 613 | 614 | refs = global_reg.references_markdown(url_to_doc) 615 | full_md = f"{body_renumbered}\n\n{refs}" 616 | 617 | total_citations = len(global_reg.ordered) 618 | total_chars = len(full_md) 619 | print(f"[REVIEW] → {total_chars} chars, {total_citations} unique sources") 620 | 621 | # Review & optional revise 622 | print("[REVIEW] Running quality review...") 623 | with dspy.context(lm=WRITER_LM): 624 | rv = REVIEWER(report_md=full_md) 625 | rj = safe_json_loads(rv.output_json, {}) or {} 626 | 627 | pass_checks = rj.get("pass_checks", False) 628 | issues = rj.get("issues", []) 629 | suggestions = rj.get("suggestions", []) 630 | 631 | print(f"[REVIEW] → Pass: {pass_checks}, Issues: {len(issues)}, Suggestions: {len(suggestions)}") 632 | 633 | if not pass_checks and suggestions: 634 | print(f"[REVIEW] Applying {len(suggestions)} revision suggestions...") 635 | with dspy.context(lm=WRITER_LM): 636 | rev = REVISER(report_md=full_md, suggestions="\n".join(f"- {s}" for s in suggestions)) 637 | full_md = (rev.improved_md or body_renumbered).strip() + "\n\n" + refs 638 | print("[REVIEW] → Revision complete") 639 | 640 | print() 641 | return {"report_md": full_md, "references_md": refs} 642 | 643 | # ---------------------------- 644 | # Evaluation 645 | # ---------------------------- 646 | 647 | DEFAULT_EVAL_QUESTIONS = [ 648 | "Does each section follow the instructions and include concrete facts?", 649 | "Are all nontrivial claims cited with [n] and do references look reputable?", 650 | "Is the structure clear with helpful headings/subheadings?", 651 | "Are there explicit dates for time-sensitive facts?", 652 | "Are there at least 2–3 sources per major section?", 653 | "Are URLs omitted from the prose (only numeric citations)?", 654 | "Is there any hallucination smell?", 655 | ] 656 | 657 | def eval_report_simple(md: str) -> EvalResult: 658 | checks = {} 659 | checks["has_h1"] = 1.0 if re.search(r"^#\s+", md, flags=re.M) else 0.0 660 | cites = len(re.findall(r"\[\d+\]", md)) 661 | checks["enough_cites"] = clamp(cites/10) 662 | checks["no_raw_urls"] = 1.0 if not re.search(r"https?://", md.split("## References")[0]) else 0.0 663 | checks["has_refs"] = 1.0 if "## References" in md else 0.0 664 | checks["length_ok"] = 1.0 if len(md) >= 2000 else 0.4 if len(md) >= 1200 else 0.1 665 | score = sum(checks.values())/len(checks) 666 | return EvalResult(score=score, breakdown=checks, notes=f"{cites} inline citations; {len(md)} chars.") 667 | 668 | # ---------------------------- 669 | # Build graph 670 | # ---------------------------- 671 | 672 | def build_graph() -> Any: 673 | graph = StateGraph(GraphState) 674 | graph.add_node("plan_queries", plan_queries) 675 | graph.add_node("search_node", search_node) # async 676 | graph.add_node("merge_and_gap_analyze", merge_and_gap_analyze) 677 | graph.add_node("write_section_node", write_section_node) 678 | graph.add_node("assemble_and_review", assemble_and_review) 679 | 680 | graph.add_edge(START, "plan_queries") 681 | graph.add_conditional_edges("plan_queries", route_queries, ["search_node"]) 682 | graph.add_edge("search_node", "merge_and_gap_analyze") 683 | graph.add_conditional_edges("merge_and_gap_analyze", route_or_write, ["search_node", "write_section_node"]) 684 | graph.add_edge("write_section_node", "assemble_and_review") 685 | graph.add_edge("assemble_and_review", END) 686 | return graph.compile() 687 | 688 | # ---------------------------- 689 | # Run end-to-end 690 | # ---------------------------- 691 | 692 | async def 
run_pipeline(topic: str, sections: List[SectionSpec], optimization: bool = False) -> Dict[str, Any]: 693 | print("\n" + "="*80) 694 | print(f"RESEARCH PIPELINE: {topic}") 695 | print("="*80) 696 | print(f"Sections: {', '.join([s.name for s in sections])}") 697 | print("="*80 + "\n") 698 | 699 | # GEPA optimization with module-specific training sets 700 | if optimization: 701 | print("[GEPA] Starting prompt optimization...") 702 | optimize_with_gepa() 703 | print("[GEPA] Optimization complete!\n") 704 | 705 | print("[PIPELINE] Building research graph...") 706 | app = build_graph() 707 | initial_state: GraphState = { 708 | "topic": topic, 709 | "sections": sections, 710 | "round": 0, 711 | "queries": [], 712 | "research": [], 713 | "drafts": {}, 714 | "cite_maps": {}, 715 | "used_urls": [], 716 | "report_md": None, 717 | "references_md": None, 718 | "eval_result": None, 719 | } 720 | 721 | print("[PIPELINE] Executing multi-agent research workflow...\n") 722 | final_state: GraphState = await app.ainvoke(initial_state) 723 | 724 | print("\n" + "="*80) 725 | print("[PIPELINE] Evaluating report quality...") 726 | print("="*80) 727 | 728 | # Evaluate 729 | md = final_state.get("report_md") or "" 730 | final_state["eval_result"] = eval_report_simple(md) 731 | 732 | # Save 733 | print("[PIPELINE] Saving report to ./report.md") 734 | with open("report.md", "w", encoding="utf-8") as f: 735 | f.write(md) 736 | 737 | print("[PIPELINE] ✓ Pipeline complete!\n") 738 | return final_state 739 | 740 | # ---------------------------- 741 | # Example usage 742 | # ---------------------------- 743 | 744 | SECTIONS = [ 745 | SectionSpec( 746 | name="Executive Summary", 747 | instructions="In 180–250 words, summarize the most decision-relevant takeaways. No citations here unless needed for key numbers." 748 | ), 749 | SectionSpec( 750 | name="Market Landscape", 751 | instructions="Define the space; 2023–2025 trends; include 4+ specific figures with sources." 752 | ), 753 | SectionSpec( 754 | name="Key Players & Differentiation", 755 | instructions="Compare 5–7 players; list 1–2 distinctive capabilities each; add 2–3 objective benchmarks with citations." 756 | ), 757 | SectionSpec( 758 | name="Risks & Open Questions", 759 | instructions="Top risks, unknowns, and watch items; cite evidence; use bullets." 760 | ), 761 | SectionSpec( 762 | name="Outlook (12–24 months)", 763 | instructions="3–5 grounded predictions with supporting evidence and explicit dates; include leading indicators to track." 764 | ), 765 | ] 766 | 767 | if __name__ == "__main__": 768 | topic = "State of Edge AI Acceleration (2024–2025)" 769 | try: 770 | final = asyncio.run(run_pipeline( 771 | topic=topic, 772 | sections=SECTIONS, 773 | # optimization=True # Uncomment to enable GEPA optimization 774 | )) 775 | except Exception as e: 776 | print("Pipeline failed:", e) 777 | raise 778 | 779 | print("\n" + "="*88) 780 | print("FINAL MARKDOWN (also saved to ./report.md):") 781 | print("="*88 + "\n") 782 | print(final.get("report_md", "")) 783 | 784 | ev: EvalResult = final.get("eval_result") or EvalResult(score=0.0, breakdown={}, notes="") 785 | print("\n" + "-"*88) 786 | print("EVALUATION (quick heuristic):") 787 | print("-"*88) 788 | print(f"Score: {ev.score:.2f}") 789 | print("Breakdown:", json.dumps(ev.breakdown, indent=2)) 790 | print("Notes:", ev.notes) 791 | --------------------------------------------------------------------------------