├── dspy-gepa-sql-generator ├── pyproject.toml ├── README.md └── nl2sql_gepa.py ├── dspy-fact-checker ├── pyproject.toml ├── LICENSE ├── README.md └── fact_check_rag.py ├── dspy-gepa-deidentification ├── pyproject.toml ├── LICENSE ├── README.md └── minimal_gepa_deid.py ├── dspy-gepa-researcher ├── pyproject.toml ├── LICENSE ├── README.md ├── report.md └── dspy_gepa_researcher.py ├── .gitignore └── README.md /dspy-gepa-sql-generator/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-sql-generator" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | ] 10 | -------------------------------------------------------------------------------- /dspy-fact-checker/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-fact-checker" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | "ipykernel>=7.0.1", 10 | "wikipedia>=1.4.0", 11 | ] 12 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-deidentification" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy-ai>=3.0.3", 9 | "gepa>=0.0.7", 10 | "ipykernel>=7.0.1", 11 | ] 12 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "dspy-gepa-researcher" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "dspy>=3.0.3", 9 | "exa-py>=1.16.1", 10 | "gepa>=0.0.7", 11 | "ipykernel>=7.0.1", 12 | "langgraph>=1.0.1", 13 | "litellm>=1.78.7", 14 | "pydantic>=2.12.3", 15 | "python-dateutil>=2.9.0.post0", 16 | ] 17 | -------------------------------------------------------------------------------- /dspy-fact-checker/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv/ 11 | venv/ 12 | env/ 13 | ENV/ 14 | .virtualenv 15 | 16 | # Environment variables and secrets 17 | .env 18 | .env.* 19 | .env.local 20 | .env.*.local 21 | *.key 22 | *.pem 23 | secrets.yaml 24 | secrets.json 25 | credentials.json 26 | .secrets/ 27 | config.local.* 28 | 29 | # IDE and editor files 30 | .vscode/ 31 | .idea/ 32 | *.swp 33 | *.swo 34 | *~ 35 | .DS_Store 36 | *.sublime-project 37 | *.sublime-workspace 38 | 39 | # OS-specific files 40 | .DS_Store 41 | Thumbs.db 42 | Desktop.ini 43 | 44 | # Python testing and coverage 45 | .pytest_cache/ 46 | .coverage 47 | .coverage.* 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Type checking 56 | .mypy_cache/ 57 | .dmypy.json 58 | dmypy.json 59 | .pyre/ 60 | .pytype/ 61 | 62 | # Jupyter Notebook 63 | .ipynb_checkpoints 64 | *.ipynb_checkpoints/ 65 | 66 | # Python distribution 67 | *.egg 68 | *.egg-info/ 69 | dist/ 70 | build/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | 82 | # Logs and databases 83 | *.log 84 | *.sql 85 | *.sqlite 86 | *.db 87 | 88 | # uv specific 89 | .python-version 90 | uv.lock 91 | 92 | # DSPy specific 93 | dspy_cache/ 94 | .dspy_cache/ 95 | compiled_programs/ 96 | *.dspy 97 | dspy_*.json 98 | 99 | # ML/AI artifacts 100 | models/ 101 | checkpoints/ 102 | *.pkl 103 | *.pickle 104 | *.joblib 105 | *.h5 106 | *.ckpt 107 | *.safetensors 108 | wandb/ 109 | mlruns/ 110 | experiments/ 111 | outputs/ 112 | 113 | # Data files - IMPORTANT: exclude all data to prevent PII leaks 114 | data/ 115 | *.csv 116 | *.tsv 117 | *.xlsx 118 | *.xls 119 | *.json 120 | *.jsonl 121 | *.parquet 122 | *.arrow 123 | *.feather 124 | raw_data/ 125 | processed_data/ 126 | test_data/ 127 | sample_data/ 128 | 129 | # Exceptions for important config files (override *.json above) 130 | !package.json 131 | !tsconfig.json 132 | !pyproject.toml 133 | 134 | # Exclude sample outputs that might contain generated text 135 | output/ 136 | results/ 137 | reports/*.txt 138 | reports/*.md 139 | 140 | # Temporary and scratch files 141 | scratch/ 142 | tmp/ 143 | temp/ 144 | *.tmp 145 | notes.txt 146 | todo.txt 147 | 148 | # Backup files 149 | *.bak 150 | *.backup 151 | *~.nib 152 | *.orig 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DSPy Examples with GEPA 2 | 3 | A collection of practical examples demonstrating how to use [DSPy](https://dspy.ai/). 4 | 5 | ## About 6 | 7 | This repository contains various examples showcasing different applications of DSPy (+ GEPA optimizer). 8 | 9 | ### What is DSPy? 10 | 11 | DSPy is a framework for algorithmically optimizing language model prompts and weights. Instead of manually tweaking prompts, you define what your system should do (the signature), and DSPy figures out how to do it through optimization. 12 | 13 | ### What is GEPA? 
14 | 15 | GEPA (Generalized Evolution of Prompting via Adaptation) is a DSPy optimizer that: 16 | - Automatically improves prompts through iterative reflection 17 | - Learns from feedback metrics without requiring labeled data 18 | - Evolves instructions based on performance analysis 19 | - Enables zero-shot learning with just input/output pairs 20 | 21 | ## Examples 22 | 23 | ### 1. [PII De-identification](./dspy-gepa-deidentification/) 24 | 25 | Demonstrates using GEPA to automatically optimize prompts for redacting personally identifiable information (PII) from incident reports. 26 | 27 | **Key concepts:** 28 | - Automatic prompt optimization for sensitive data handling 29 | - Dual metric systems (simple and composite) 30 | - Structured output preservation 31 | - Feedback-driven learning 32 | 33 | [View example →](./dspy-gepa-deidentification/) 34 | 35 | ### 2. [Fact-Checked RAG](./dspy-fact-checker/) 36 | 37 | A self-correcting Retrieval-Augmented Generation system that fact-checks its own answers against Wikipedia sources and automatically refines responses until they are fully supported by evidence. 38 | 39 | **Key concepts:** 40 | - Self-correcting pipeline with dspy.Refine 41 | - Fact verification against retrieved context 42 | - Wikipedia integration for knowledge retrieval 43 | - Automatic retry and refinement 44 | 45 | [View example →](./dspy-fact-checker/) 46 | 47 | ### 3. [Natural Language to SQL](./dspy-gepa-sql-generator/) 48 | 49 | Demonstrates using GEPA to optimize prompts for converting natural language questions into SQL queries, with comprehensive safety and correctness validation. 50 | 51 | **Key concepts:** 52 | - Natural language to SQL generation 53 | - Custom metrics for safety, execution, and correctness 54 | - Database schema understanding 55 | - Query optimization through GEPA 56 | 57 | [View example →](./dspy-gepa-sql-generator/) 58 | 59 | ## Getting Started 60 | 61 | Each example directory contains its own README with: 62 | - Detailed setup instructions 63 | - Usage examples 64 | - Configuration details 65 | - Code explanations 66 | 67 | Navigate to any example directory to get started. 68 | 69 | ## Requirements 70 | 71 | Most examples require: 72 | - Python 3.13+ 73 | - An OpenAI API key (or compatible LLM provider) 74 | - Dependencies managed via [uv](https://github.com/astral-sh/uv) or pip 75 | 76 | ## Contributing 77 | 78 | Additional examples are welcome! If you have a useful DSPy (+ GEPA) example to share: 79 | 1. Create a new directory with a descriptive name 80 | 2. Include a comprehensive README.md 81 | 3. Ensure dependencies are clearly documented 82 | 4. Add your example to this README's Examples section 83 | 5. Submit a pull request 84 | 85 | ## Resources 86 | 87 | - [DSPy Documentation](https://dspy.ai/) 88 | - [GEPA Optimizer Documentation](https://dspy.ai/api/optimizers/GEPA/overview/) 89 | - [DSPy GitHub Repository](https://github.com/stanfordnlp/dspy) 90 | 91 | ## License 92 | 93 | Individual examples may have their own licenses. Please check each example directory for details. 94 | 95 | --- 96 | 97 | **Note**: These are demonstration projects for educational purposes. Always validate and thoroughly test before using in production environments. 
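For a concrete feel of the pattern the examples share, here is a minimal, illustrative DSPy + GEPA sketch. The signature, toy metric, and model names below are placeholders for illustration, not code taken from any one example:

```python
import dspy

# Task model does the work; a (typically stronger) model reflects on prompts.
task_lm = dspy.LM("openai/gpt-4o-mini")
reflect_lm = dspy.LM("openai/gpt-4o")
dspy.configure(lm=task_lm)

class Summarize(dspy.Signature):
    """Summarize a passage in one sentence."""
    passage = dspy.InputField()
    summary = dspy.OutputField(desc="One-sentence summary.")

program = dspy.ChainOfThought(Summarize)

trainset = [
    dspy.Example(passage="DSPy lets you declare what a module should do via signatures.").with_inputs("passage"),
    dspy.Example(passage="GEPA evolves instructions by reflecting on metric feedback.").with_inputs("passage"),
]

def brevity_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    # Toy metric: reward short summaries and tell GEPA why the score was given.
    words = len((pred.summary or "").split())
    score = 1.0 if 0 < words <= 25 else 0.0
    feedback = "Good length." if score else "Return exactly one sentence of at most 25 words."
    return dspy.Prediction(score=score, feedback=feedback)

gepa = dspy.GEPA(metric=brevity_metric, auto="light", reflection_lm=reflect_lm)
optimized = gepa.compile(program, trainset=trainset, valset=trainset)
print(optimized(passage="GEPA rewrites module instructions based on feedback.").summary)
```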
98 | -------------------------------------------------------------------------------- /dspy-gepa-deidentification/README.md: -------------------------------------------------------------------------------- 1 | # DSPy GEPA for PII De-identification 2 | 3 | A minimal example demonstrating how to use DSPy's GEPA (Generalized Evolution of Prompting via Adaptation) optimizer to automatically improve PII (Personally Identifiable Information) redaction in incident reports. 4 | 5 | ## Overview 6 | 7 | This project showcases how GEPA can optimize prompts for sensitive data de-identification tasks through reflection-based prompt evolution. The system learns to: 8 | - Redact emails, phone numbers, and names using standard placeholders 9 | - Preserve document structure (headers, bullet points) 10 | - Maintain causal relationships and action items 11 | - Avoid fabricating new information 12 | 13 | ## Features 14 | 15 | - **Automatic Prompt Optimization**: GEPA evolves instructions based on feedback metrics 16 | - **Dual Metric System**: Includes both a simple and composite metric for evaluation 17 | - **Structured Output**: Maintains "Root cause:" and "Action items:" sections with bullets 18 | - **Zero-Shot Learning**: No labeled examples required - just input/output pairs 19 | - **Feedback-Driven**: Rich textual feedback guides the optimization process 20 | 21 | ## Installation 22 | 23 | This project uses [uv](https://github.com/astral-sh/uv) for dependency management. 24 | 25 | ```bash 26 | # Install dependencies with uv 27 | uv sync 28 | 29 | # Or with pip 30 | pip install dspy-ai gepa ipykernel 31 | ``` 32 | 33 | ### Requirements 34 | 35 | - Python 3.13+ 36 | - OpenAI API key (for GPT-4o and GPT-4o-mini) 37 | 38 | ## Configuration 39 | 40 | Create a `.env` file in the project root with your OpenAI API key: 41 | 42 | ```bash 43 | OPENAI_API_KEY=your-api-key-here 44 | ``` 45 | 46 | **Important**: Never commit your `.env` file or API keys to version control. 47 | 48 | ## Usage 49 | 50 | Run the minimal example: 51 | 52 | ```python 53 | uv run minimal_gepa_deid.py 54 | ``` 55 | 56 | The script will: 57 | 1. Define a de-identification signature and module 58 | 2. Configure GEPA with a reflection model 59 | 3. Optimize the module on training examples 60 | 4. Test on a sample incident report 61 | 62 | ### Example Output 63 | 64 | **Input:** 65 | ``` 66 | Root cause: Dave Miller called 650-555-0000 to report breach. 67 | Action items: 68 | - email dave@contoso.com 69 | - notify legal 70 | ``` 71 | 72 | **Output:** 73 | ``` 74 | Root cause: [NAME] called [PHONE] to report breach. 75 | Action items: 76 | - email [EMAIL] 77 | - notify legal 78 | ``` 79 | 80 | ## How It Works 81 | 82 | 1. **Signature Definition**: Specifies what the module should do (not how) 83 | 2. **Module Creation**: Uses `ChainOfThought` for reasoning about redactions 84 | 3. **Metric with Feedback**: Returns both a score and textual guidance 85 | 4. **GEPA Optimization**: Evolves internal instructions through reflection 86 | 5. 
**Inference**: Apply the optimized module to new reports 87 | 88 | ### Metrics 89 | 90 | **Simple Metric (`pii_metric`)**: 91 | - 60% score for zero PII leaks 92 | - 20% for preserving "Root cause:" header 93 | - 20% for preserving "Action items:" header 94 | 95 | **Composite Metric (`composite_pii_metric`)**: 96 | - Stricter checks including bullet point formatting 97 | - Hallucination detection (no new PII introduction) 98 | - Penalty-based scoring (1.0 - 0.25 × issues) 99 | 100 | ## Important Notes 101 | 102 | ### Data Privacy 103 | - This is a **demonstration project** - do not use with real sensitive data without thorough testing 104 | - Never commit actual incident reports or PII to version control 105 | - All data files (`.csv`, `.json`, etc.) are git-ignored by default 106 | 107 | ### Model Selection 108 | The example uses: 109 | - **Task LM**: `gpt-4o-mini` (faster, cheaper for execution) 110 | - **Reflection LM**: `gpt-4o` (stronger for meta-reasoning about prompts) 111 | 112 | You can adjust these in the code based on your needs and budget. 113 | 114 | ## Development 115 | 116 | ### Running in Jupyter 117 | The script can also be run in Jupyter notebooks. The project includes `ipykernel` for this purpose. 118 | 119 | ### Customization 120 | - Modify regex patterns in `EMAIL`, `PHONE`, `NAME` for your use case 121 | - Adjust scoring weights in the metric functions 122 | - Switch between `pii_metric` and `composite_pii_metric` in the GEPA configuration 123 | - Tune GEPA parameters (currently using `auto="light"` for quick demos) 124 | 125 | ## GEPA Configuration 126 | 127 | ```python 128 | gepa = dspy.GEPA( 129 | metric=pii_metric, 130 | auto="light", # or "medium"/"heavy" for more optimization 131 | reflection_lm=reflect_lm, # Stronger model for reflection 132 | track_stats=True, # Track optimization statistics 133 | track_best_outputs=True # Keep best candidates per input 134 | ) 135 | ``` 136 | 137 | ## References 138 | 139 | - [DSPy Documentation](https://dspy.ai/) 140 | - [GEPA Overview](https://dspy.ai/api/optimizers/GEPA/overview/) 141 | - [DSPy GitHub Repository](https://github.com/stanfordnlp/dspy) 142 | 143 | ## Contributing 144 | 145 | Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. 146 | 147 | ## License 148 | 149 | MIT License - see [LICENSE](LICENSE) for details. 150 | 151 | ## Acknowledgments 152 | 153 | Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP and [GEPA](https://dspy.ai/api/optimizers/GEPA/) optimizer. 154 | 155 | --- 156 | 157 | **Disclaimer**: This is a demonstration project for educational purposes. Always validate and test thoroughly before using in production environments with sensitive data. 158 | 159 | ## Citation 160 | 161 | If you use this project in your research, please cite: 162 | 163 | ```bibtex 164 | @software{dspy_examples, 165 | title={DSPy GEPA for PII De-identification}, 166 | author={Your Name}, 167 | year={2025}, 168 | url={https://github.com/raja-patnaik/dspy-examples} 169 | } 170 | ``` -------------------------------------------------------------------------------- /dspy-fact-checker/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Fact-Checked RAG 2 | 3 | A self-correcting Retrieval-Augmented Generation (RAG) system built with [DSPy](https://github.com/stanfordnlp/dspy) that fact-checks its own answers against Wikipedia sources. 
The system automatically refines responses until they are fully supported by retrieved evidence. 4 | 5 | ## Features 6 | 7 | - **Self-Correcting Pipeline**: Uses `dspy.Refine` to iteratively improve answers based on verification feedback 8 | - **Fact Verification**: Built-in verifier that checks if answer claims are supported by retrieved context 9 | - **Wikipedia Integration**: Custom retriever that fetches relevant passages from Wikipedia 10 | - **Automatic Retry**: Continues refining answers until all claims are verifiable (up to max attempts) 11 | - **Source Attribution**: Embeds Wikipedia URLs in context for transparency 12 | 13 | ## How It Works 14 | 15 | 1. **Retrieve**: Fetches relevant Wikipedia passages based on the question 16 | 2. **Generate**: Creates an answer using only information from the retrieved context 17 | 3. **Verify**: Checks if the answer contains any unsupported claims 18 | 4. **Refine**: If verification fails, automatically regenerates with feedback until the answer is fully supported 19 | 20 | The pipeline uses DSPy's `Refine` module with a reward function that scores 1.0 only when the verifier confirms all claims are supported by the context. 21 | 22 | ## Requirements 23 | 24 | - Python 3.13+ 25 | - OpenAI API key (uses `gpt-4o-mini` by default) 26 | 27 | ## Installation 28 | 29 | 1. Change to the project folder after cloning the repo: 30 | ```bash 31 | cd dspy-fact-checker 32 | ``` 33 | 34 | 2. Install dependencies using [uv](https://github.com/astral-sh/uv): 35 | ```bash 36 | uv sync 37 | ``` 38 | 39 | Or with pip: 40 | ```bash 41 | pip install dspy-ai wikipedia 42 | ``` 43 | 44 | 3. Set your OpenAI API key: 45 | ```bash 46 | export OPENAI_API_KEY="your-api-key-here" # Linux/Mac 47 | # OR 48 | set OPENAI_API_KEY=your-api-key-here # Windows 49 | ``` 50 | 51 | ## Usage 52 | 53 | Run the example: 54 | ```bash 55 | uv run fact_check_rag.py 56 | ``` 57 | 58 | ### Example Output 59 | 60 | ``` 61 | Question: When did Apollo 11 land on the Moon, and who were the astronauts involved? 62 | -------------------------------------------------------------------------------- 63 | Final Answer: 64 | Apollo 11 landed on the Moon on July 20, 1969. The astronauts involved were 65 | Neil Armstrong, Buzz Aldrin, and Michael Collins, with Armstrong and Aldrin 66 | walking on the lunar surface while Collins remained in orbit. 67 | -------------------------------------------------------------------------------- 68 | Unsupported Claims: None 69 | -------------------------------------------------------------------------------- 70 | Context used: 71 | [1] Apollo 11: Apollo 11 was the American spaceflight that first landed humans... 72 | [2] Neil Armstrong: Neil Alden Armstrong was an American astronaut... 73 | ``` 74 | 75 | ### Customizing Questions 76 | 77 | Edit the `question` variable in `fact_check_rag.py`: 78 | 79 | ```python 80 | question = "Who discovered penicillin and in which year was it first reported?" 81 | # or 82 | question = "When was the first FIFA World Cup held, and where?" 
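# Any factual question with Wikipedia coverage works; if the verifier flags
# unsupported claims, the answer is regenerated (up to max_attempts times).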
83 | ``` 84 | 85 | ### Configuration Options 86 | 87 | Adjust the RAG parameters: 88 | 89 | ```python 90 | program = FactCheckedRAG( 91 | k_passages=4, # Number of Wikipedia passages to retrieve 92 | max_attempts=3 # Maximum refinement iterations 93 | ) 94 | ``` 95 | 96 | Customize the Wikipedia retriever: 97 | 98 | ```python 99 | wiki_rm = WikipediaRetriever( 100 | max_chars_per_passage=1500, # Characters per passage 101 | language="en" # Wikipedia language code 102 | ) 103 | ``` 104 | 105 | ## Architecture 106 | 107 | ### Components 108 | 109 | - **WikipediaRetriever**: Custom retriever that searches Wikipedia and formats results for DSPy 110 | - **GenerateAnswer**: Signature for creating answers strictly from provided context 111 | - **VerifyAnswer**: Signature for identifying unsupported claims in answers 112 | - **FactCheckedRAG**: Main module combining retrieval, generation, and verification with refinement 113 | 114 | ### Key Technologies 115 | 116 | - [DSPy](https://github.com/stanfordnlp/dspy): Framework for programming language models 117 | - [Wikipedia API](https://pypi.org/project/wikipedia/): Python library for Wikipedia data 118 | - OpenAI GPT-4o-mini: Language model for generation and verification 119 | 120 | ## Advanced Usage 121 | 122 | ### Using Different LLMs 123 | 124 | Replace the OpenAI configuration with other supported DSPy models: 125 | 126 | ```python 127 | # Anthropic Claude 128 | lm = dspy.LM("anthropic/claude-3-5-sonnet-20241022", api_key=os.environ.get("ANTHROPIC_API_KEY")) 129 | 130 | # Local models 131 | lm = dspy.LM("ollama/llama2") 132 | ``` 133 | 134 | ### Custom Retrievers 135 | 136 | Extend the retriever for other knowledge sources: 137 | 138 | ```python 139 | class CustomRetriever: 140 | def __call__(self, query: str, k: int = 8): 141 | # Implement your retrieval logic 142 | # Must return list[dotdict] with `.long_text` attribute 143 | pass 144 | ``` 145 | 146 | ## Limitations 147 | 148 | - Relies on Wikipedia data quality and coverage 149 | - Answer quality depends on the underlying LLM's capabilities 150 | - May require multiple refinement attempts for complex questions 151 | - English Wikipedia by default (configurable) 152 | 153 | ## Contributing 154 | 155 | Contributions are welcome! Please feel free to submit issues or pull requests. 156 | 157 | ## License 158 | 159 | MIT License - see [LICENSE](LICENSE) for details. 
160 | 161 | ## Acknowledgments 162 | 163 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 164 | - Uses the [Wikipedia API](https://pypi.org/project/wikipedia/) for knowledge retrieval 165 | 166 | ## Citation 167 | 168 | If you use this project in your research, please cite: 169 | 170 | ```bibtex 171 | @software{dspy_examples, 172 | title={DSPy Multi-Agent Research Pipeline}, 173 | author={Your Name}, 174 | year={2025}, 175 | url={https://github.com/raja-patnaik/dspy-examples} 176 | } 177 | ``` -------------------------------------------------------------------------------- /dspy-gepa-deidentification/minimal_gepa_deid.py: -------------------------------------------------------------------------------- 1 | import re 2 | import dspy 3 | 4 | 5 | # 0) Pick task + reflection models (reflection ≈ stronger) 6 | task_lm = dspy.LM("openai/gpt-4o-mini") 7 | reflect_lm = dspy.LM("openai/gpt-4o") 8 | dspy.configure(lm=task_lm) # global default LM for modules :contentReference[oaicite:8]{index=8} 9 | 10 | # 1) Signature: what the module does (not how to prompt) 11 | class DeIDSignature(dspy.Signature): 12 | """Rewrite an incident report to remove PII while preserving causal structure and action items.""" 13 | report = dspy.InputField(desc="Raw incident report text.") 14 | rules = dspy.InputField(desc="Redaction rules and required output format.") 15 | clean_report = dspy.OutputField( 16 | desc="Redacted report using [EMAIL], [PHONE], [NAME]. Keep 'Root cause:' + 'Action items:' and bullets." 17 | ) 18 | 19 | # 2) Module: we’ll let GEPA evolve its internal instructions 20 | class DeIDProgram(dspy.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.rewriter = dspy.ChainOfThought(DeIDSignature) # adds .reasoning field to the prediction :contentReference[oaicite:9]{index=9} 24 | def forward(self, report, rules): 25 | return self.rewriter(report=report, rules=rules) 26 | 27 | student = DeIDProgram() 28 | 29 | # 3) Tiny “dataset”: GEPA doesn’t require labels, just examples to evaluate on 30 | RULES = """Redact emails, phone numbers, and full names. Use placeholders [EMAIL], [PHONE], [NAME]. 31 | Keep section headers and bullets. Output format: 32 | Root cause: ... 33 | Action items: ... 34 | - bullets for action items""" 35 | 36 | trainset = [ 37 | dspy.Example( 38 | report="Root cause: Alice Chen emailed ops (alice.chen@acme.io).\nAction items:\n- Call +1 (415) 555-0199 to notify vendor.", 39 | rules=RULES 40 | ).with_inputs("report", "rules"), 41 | dspy.Example( 42 | report="Root cause: Misconfigured S3 bucket by Bob A.\nAction items:\n- Rotate keys\n- email secops@company.com with incident ID 12345", 43 | rules=RULES 44 | ).with_inputs("report", "rules"), 45 | ] 46 | 47 | devset = [ 48 | dspy.Example( 49 | report="Root cause: OT sensor alert phoned to 212-555-0101 by Carol Q.\nAction items:\n- File ticket\n- email ops@example.org", 50 | rules=RULES 51 | ).with_inputs("report", "rules"), 52 | ] 53 | # Note: .with_inputs tells DSPy which fields are inputs for evaluation/compilation. :contentReference[oaicite:10]{index=10} 54 | 55 | # 4) Metric with feedback: score + *text* guidance for GEPA 56 | EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}") 57 | PHONE = re.compile(r"(?:\\+?\\d{1,3}[-. (]*)?\\d{3}[-. )]*\\d{3}[-. 
]*\\d{4}") 58 | NAME = re.compile(r"\\b([A-Z][a-z]+ [A-Z][a-z]+)\\b") 59 | 60 | def pii_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): 61 | text = (pred.clean_report or "").strip() 62 | leaks = [] 63 | if EMAIL.search(text): 64 | leaks.append("email") 65 | if PHONE.search(text): 66 | leaks.append("phone") 67 | if NAME.search(gold.report) and "[NAME]" not in text: 68 | leaks.append("name") 69 | 70 | keeps_root = "Root cause:" in text 71 | keeps_actions = "Action items:" in text 72 | 73 | # Score ∈ [0,1]: 0.6 for zero leaks + 0.2 each for keeping the two sections 74 | score = (0.6 if not leaks else 0.0) + (0.2 if keeps_root else 0.0) + (0.2 if keeps_actions else 0.0) 75 | 76 | feedback = [] 77 | if leaks: 78 | feedback.append(f"PII leaked: {', '.join(leaks)}. Replace PII with [EMAIL], [PHONE], [NAME].") 79 | if not keeps_root or not keeps_actions: 80 | missing = [] 81 | if not keeps_root: 82 | missing.append("keep 'Root cause:'") 83 | if not keeps_actions: 84 | missing.append("keep 'Action items:'") 85 | feedback.append("Also " + " and ".join(missing) + ".") 86 | if not feedback: 87 | feedback.append("Great: no PII and structure preserved. Prefer succinct edits; avoid adding facts.") 88 | 89 | return dspy.Prediction(score=score, feedback=" ".join(feedback)) # GEPA reads this feedback to evolve instructions. 90 | 91 | 92 | # Slightly stricter composite metric 93 | def composite_pii_metric(gold, pred, trace=None, pred_name=None, pred_trace=None): 94 | text = (pred.clean_report or "").strip() 95 | issues = [] 96 | 97 | # 1) PII leak checks (extend with better detectors as needed) 98 | leaks = [] 99 | if EMAIL.search(text): 100 | leaks.append("email") 101 | if PHONE.search(text): 102 | leaks.append("phone") 103 | if NAME.search(gold.report) and "[NAME]" not in text: 104 | leaks.append("name") 105 | if leaks: 106 | issues.append(f"PII leaked: {', '.join(leaks)}; replace with placeholders.") 107 | 108 | # 2) Structure invariants 109 | if "Root cause:" not in text: 110 | issues.append("Missing header: 'Root cause:'.") 111 | if "Action items:" not in text: 112 | issues.append("Missing header: 'Action items:'.") 113 | 114 | # 3) Formatting: require bullets for action items 115 | if "Action items:" in text: 116 | after = text.split("Action items:", 1)[1] 117 | if "-" not in after and "\n•" not in after: 118 | issues.append("Action items must be bulleted with '-' or '•'.") 119 | 120 | # 4) No fabrication: forbid adding new emails/phones beyond placeholders 121 | hallucination = EMAIL.findall(text) or PHONE.findall(text) 122 | if hallucination: 123 | issues.append("Do not introduce new PII; use placeholders only.") 124 | 125 | # Score scheme 126 | base = 1.0 127 | penalty = 0.25 * len(issues) # tune per your tolerance 128 | score = max(0.0, base - penalty) 129 | feedback = " ".join(issues) if issues else ( 130 | "Great: no leaks, headers intact, bullets present; keep edits minimal and factual." 131 | ) 132 | return dspy.Prediction(score=score, feedback=feedback) 133 | 134 | 135 | # 5) Run GEPA (reflection model must be provided) 136 | gepa = dspy.GEPA( 137 | # metric=pii_metric, 138 | metric=composite_pii_metric, 139 | auto="light", 140 | reflection_lm=reflect_lm, 141 | track_stats=True, 142 | track_best_outputs=True # also useful as an inference-time search to surface best candidates per input 143 | ) # See GEPA API for params like candidate_selection_strategy='pareto'. 
:contentReference[oaicite:12]{index=12} 144 | 145 | optimized = gepa.compile(student, trainset=trainset, valset=devset) 146 | 147 | # 6) Try it 148 | test_report = ( 149 | "Root cause: Dave Miller called 650-555-0000 to report breach.\n" 150 | "Action items:\n- email dave@contoso.com\n- notify legal" 151 | ) 152 | print(optimized(report=test_report, rules=RULES).clean_report) 153 | 154 | # Optional: Inspect the Pareto/best outputs per instance 155 | # print(optimized.detailed_results.best_outputs_valset) # requires track_best_outputs=True :contentReference[oaicite:13]{index=13} -------------------------------------------------------------------------------- /dspy-fact-checker/fact_check_rag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dspy 3 | import wikipedia 4 | from dspy.dsp.utils import dotdict 5 | 6 | 7 | # -------------------------------------------------------------------------------------- 8 | # Wikipedia API Retriever that returns objects with a `.long_text` field (DSPy expects this) 9 | # -------------------------------------------------------------------------------------- 10 | class WikipediaRetriever: 11 | """Simple retriever using the Wikipedia Python API.""" 12 | 13 | def __init__(self, max_chars_per_passage: int = 1500, language: str = "en"): 14 | self.max_chars = max_chars_per_passage 15 | wikipedia.set_lang(language) 16 | 17 | def __call__(self, query: str, k: int = 8, **kwargs): 18 | """Return a list[dotdict] where each item has a `.long_text` attribute.""" 19 | try: 20 | titles = wikipedia.search(query, results=k) or [] 21 | except Exception: 22 | titles = [] 23 | 24 | passages = [] 25 | 26 | for title in titles[:k]: 27 | page = None 28 | picked_title = title 29 | 30 | try: 31 | page = wikipedia.page(title, auto_suggest=False) 32 | except wikipedia.exceptions.DisambiguationError as e: 33 | # Resolve disambiguation by trying the first few options 34 | for opt in e.options[:3]: 35 | try: 36 | page = wikipedia.page(opt, auto_suggest=False) 37 | picked_title = opt 38 | break 39 | except Exception: 40 | continue 41 | except Exception: 42 | pass 43 | 44 | if not page: 45 | continue 46 | 47 | text = (page.summary or "").strip() 48 | if not text: 49 | continue 50 | 51 | if len(text) > self.max_chars: 52 | text = text[: self.max_chars].rstrip() + "..." 53 | 54 | # IMPORTANT: Return a structure with `.long_text` 55 | # We also embed title + URL into the long_text so they survive DSPy's mapping. 56 | long_text = f"{picked_title}: {text} (Source: {page.url})" 57 | passages.append( 58 | dotdict( 59 | {"long_text": long_text, "title": picked_title, "url": page.url} 60 | ) 61 | ) 62 | 63 | if len(passages) >= k: 64 | break 65 | 66 | return passages 67 | 68 | 69 | # -------------------------------------------------------------------------------------- 70 | # OpenAI LM configuration (uses OPENAI_API_KEY from env) 71 | # -------------------------------------------------------------------------------------- 72 | # You can also pass api_key=... directly to dspy.LM if you prefer 73 | lm = dspy.LM("openai/gpt-4o-mini", api_key=os.environ.get("OPENAI_API_KEY")) 74 | dspy.configure( 75 | lm=lm 76 | ) # Official pattern for configuring the default LM. 
:contentReference[oaicite:1]{index=1} 77 | 78 | # Configure the retriever for dspy.Retrieve 79 | wiki_rm = WikipediaRetriever(max_chars_per_passage=1500, language="en") 80 | dspy.settings.configure(rm=wiki_rm) 81 | 82 | 83 | # -------------------------------------------------------------------------------------- 84 | # Signatures 85 | # -------------------------------------------------------------------------------------- 86 | class GenerateAnswer(dspy.Signature): 87 | """Answer the question strictly from the provided context. 88 | - Use only facts present in the context. 89 | - If the context doesn't contain the answer, say you don't know. 90 | - Keep the answer concise (2-5 sentences). 91 | """ 92 | 93 | context = dspy.InputField(desc="retrieved passages from Wikipedia") 94 | question = dspy.InputField() 95 | answer = dspy.OutputField(desc="factual answer derived ONLY from the context") 96 | 97 | 98 | class VerifyAnswer(dspy.Signature): 99 | """Given 'context' and an 'answer', list any claims in the answer that are NOT supported by the context. 100 | Output 'None' if every claim is supported. 101 | """ 102 | 103 | context = dspy.InputField(desc="retrieved passages from Wikipedia") 104 | answer = dspy.InputField(desc="candidate answer to verify") 105 | unsupported_claims = dspy.OutputField( 106 | desc="List unsupported claims or 'None' if fully supported." 107 | ) 108 | 109 | 110 | # -------------------------------------------------------------------------------------- 111 | # Self-correcting RAG with dspy.Refine 112 | # -------------------------------------------------------------------------------------- 113 | class FactCheckedRAG(dspy.Module): 114 | def __init__(self, k_passages: int = 4, max_attempts: int = 3): 115 | super().__init__() 116 | self.retrieve = dspy.Retrieve(k=k_passages) 117 | self.generate_answer = dspy.ChainOfThought(GenerateAnswer) 118 | self.verify_answer = dspy.ChainOfThought(VerifyAnswer) 119 | 120 | # Reward: 1.0 when verifier returns "None" 121 | def reward_fn(args, pred): 122 | context_text = args["context"] 123 | verification = self.verify_answer(context=context_text, answer=pred.answer) 124 | uc = (verification.unsupported_claims or "").strip().lower() 125 | return 1.0 if (uc == "" or uc == "none" or uc.startswith("none")) else 0.0 126 | 127 | # Retry generation with automatic feedback until reward meets threshold 128 | self.refine_generate = dspy.Refine( 129 | module=self.generate_answer, 130 | N=max_attempts, 131 | reward_fn=reward_fn, 132 | threshold=1.0, 133 | ) 134 | 135 | def forward(self, question: str): 136 | # Retrieve evidence (DSPy maps dotdicts -> list[str] via `.long_text`) 137 | passages = self.retrieve(question).passages # List[str] 138 | context_text = "\n\n".join([f"[{i + 1}] {p}" for i, p in enumerate(passages)]) 139 | 140 | # Generate (and refine if needed) until verified 141 | pred = self.refine_generate(context=context_text, question=question) 142 | answer = pred.answer 143 | 144 | # Final verification for reporting 145 | final_check = self.verify_answer(context=context_text, answer=answer) 146 | 147 | return dspy.Prediction( 148 | answer=answer, 149 | context=context_text, 150 | unsupported_claims=final_check.unsupported_claims, 151 | ) 152 | 153 | 154 | # -------------------------------------------------------------------------------------- 155 | # Run 156 | # -------------------------------------------------------------------------------------- 157 | if __name__ == "__main__": 158 | program = FactCheckedRAG(k_passages=4, max_attempts=3) 
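    # k_passages sets how many Wikipedia passages are retrieved as context;
    # max_attempts caps how many times dspy.Refine regenerates an unverified answer.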
159 | 160 | # Try any well-known topic 161 | question = ( 162 | "When did Apollo 11 land on the Moon, and who were the astronauts involved?" 163 | ) 164 | # question = "Who discovered penicillin and in which year was it first reported?" 165 | # question = "When was the first FIFA World Cup held, and where?" 166 | 167 | result = program(question) 168 | 169 | print(f"\nQuestion: {question}") 170 | print("-" * 80) 171 | print("Final Answer:\n", result.answer) 172 | print("-" * 80) 173 | print("Unsupported Claims:", result.unsupported_claims) 174 | print("-" * 80) 175 | print("Context used:\n", result.context) 176 | -------------------------------------------------------------------------------- /dspy-gepa-sql-generator/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Natural Language to SQL with GEPA 2 | 3 | A demonstration of using [DSPy](https://github.com/stanfordnlp/dspy) with the GEPA optimizer to automatically improve prompts for converting natural language questions into SQL queries. The system learns to generate safe, correct SQL through iterative reflection and feedback. 4 | 5 | ## Features 6 | 7 | - **Automatic Prompt Optimization**: Uses GEPA to evolve SQL generation instructions 8 | - **Safety Validation**: Blocks DDL/DML operations and enforces SELECT-only queries 9 | - **Execution Verification**: Validates SQL syntax and execution correctness 10 | - **Result Correctness**: Compares query results against expected outputs 11 | - **Heuristic Guidance**: Provides feedback on ordering, limits, aliases, and output shape 12 | - **Zero-Shot Learning**: Improves from input/output pairs without labeled prompts 13 | 14 | ## How It Works 15 | 16 | 1. **Schema Description**: Creates a compact representation of database tables with sample data 17 | 2. **Query Generation**: Uses DSPy ChainOfThought to generate SQL from natural language 18 | 3. **Multi-Level Validation**: Custom metric checks safety, execution, and correctness 19 | 4. **GEPA Optimization**: Automatically refines prompts based on performance feedback 20 | 5. **Iterative Improvement**: Learns from mistakes to generate better queries 21 | 22 | The system uses a comprehensive metric that scores queries on: 23 | - **Safety** (40%): Only SELECT/WITH statements, no forbidden operations 24 | - **Execution** (30%): SQL must run without errors 25 | - **Correctness** (30%): Results must match expected output 26 | - **Heuristic penalties**: Deducted for missing ORDER BY, LIMIT, DISTINCT, etc. 27 | 28 | ## Requirements 29 | 30 | - Python 3.13+ 31 | - OpenAI API key (or compatible LLM provider) 32 | - DSPy with GEPA optimizer support 33 | 34 | ## Installation 35 | 36 | 1. Change to the project folder after cloning the repo: 37 | ```bash 38 | cd dspy-gepa-sql-generator 39 | ``` 40 | 41 | 2. Install dependencies using [uv](https://github.com/astral-sh/uv): 42 | ```bash 43 | uv sync 44 | ``` 45 | 46 | Or with pip: 47 | ```bash 48 | pip install dspy-ai 49 | ``` 50 | 51 | 3. Set your OpenAI API key: 52 | ```bash 53 | export OPENAI_API_KEY="your-api-key-here" # Linux/Mac 54 | # OR 55 | set OPENAI_API_KEY=your-api-key-here # Windows 56 | ``` 57 | 58 | ## Usage 59 | 60 | Run the example: 61 | ```bash 62 | uv run nl2sql_gepa.py 63 | ``` 64 | 65 | The script will: 66 | 1. Show baseline performance on development queries 67 | 2. Run GEPA optimization on training queries 68 | 3. Display post-optimization performance improvements 69 | 4. 
Show before/after comparison on sample queries 70 | 71 | ### Example Output 72 | 73 | ``` 74 | == Baseline on devset == 75 | 76 | Q: Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending. 77 | SQL: 78 | SELECT a.name, SUM(s.sold) FROM authors a 79 | JOIN books b ON b.author_id = a.id 80 | JOIN sales s ON s.book_id = b.id 81 | WHERE s.year = 2024 82 | GROUP BY a.name 83 | Score: 0.600 84 | Feedback: 85 | Use ORDER BY ... DESC for descending/top/most queries. 86 | Add LIMIT 3 as requested (top 3). 87 | 88 | ... 89 | 90 | == PostGEPA on devset == 91 | 92 | Q: Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending. 93 | SQL: 94 | SELECT a.name, SUM(s.sold) AS total_sold 95 | FROM authors a 96 | JOIN books b ON b.author_id = a.id 97 | JOIN sales s ON s.book_id = b.id 98 | WHERE s.year = 2024 99 | GROUP BY a.name 100 | ORDER BY total_sold DESC 101 | LIMIT 3; 102 | Score: 1.000 103 | Feedback: 104 | Perfect score. Keep current strategy. 105 | ``` 106 | 107 | ## Configuration 108 | 109 | ### Model Settings 110 | 111 | Customize the models used for generation and reflection: 112 | 113 | ```python 114 | STUDENT_MODEL = os.getenv("DSPY_STUDENT_MODEL", "openai/gpt-4o-mini") 115 | REFLECT_MODEL = os.getenv("DSPY_REFLECT_MODEL", "openai/gpt-4o") 116 | ``` 117 | 118 | Set via environment variables: 119 | ```bash 120 | export DSPY_STUDENT_MODEL="openai/gpt-4o-mini" 121 | export DSPY_REFLECT_MODEL="openai/gpt-4o" 122 | ``` 123 | 124 | ### Database Schema 125 | 126 | The example uses an in-memory SQLite database with: 127 | - **authors**: id, name, country 128 | - **books**: id, title, year, author_id, genre, pages, price 129 | - **sales**: book_id, year, sold 130 | 131 | Modify `setup_db()` to use your own schema and data. 132 | 133 | ### Training Set 134 | 135 | The example includes 14 questions ranging from simple to complex: 136 | - Basic filtering and joins 137 | - Aggregations and grouping 138 | - Ordering and limiting 139 | - String matching and case handling 140 | - Percentage calculations 141 | 142 | Adjust `questions_and_gold_sql()` to add your own examples. 
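For example, a new question/gold-SQL pair can be appended to the `base` list inside `questions_and_gold_sql()`. The question below is hypothetical; the gold SQL only needs to run against the demo schema:

```python
# Hypothetical extra training pair for questions_and_gold_sql().
extra_pair = (
    "How many books has each author published? Return name and count, descending.",
    "SELECT a.name, COUNT(*) AS n_books "
    "FROM authors a JOIN books b ON b.author_id = a.id "
    "GROUP BY a.name ORDER BY n_books DESC;",
)
```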
143 | 144 | ### GEPA Parameters 145 | 146 | Tune optimization settings: 147 | 148 | ```python 149 | gepa = GEPA( 150 | metric=sql_metric_scalar, 151 | auto="medium", # Optimization depth: "light", "medium", "heavy" 152 | reflection_lm=reflection_lm, # Model for reflection 153 | track_stats=True, # Track optimization statistics 154 | ) 155 | ``` 156 | 157 | ## Architecture 158 | 159 | ### Components 160 | 161 | **NL2SQL Signature**: Defines the input/output specification for SQL generation 162 | - Input: Database schema with sample rows, natural language question 163 | - Output: Single safe SQL SELECT statement 164 | 165 | **NL2SQLProgram Module**: Wraps ChainOfThought for SQL generation 166 | 167 | **WikipediaRetriever**: Custom metric combining multiple validation layers: 168 | - Safety checks (forbidden keywords, statement type) 169 | - Execution validation (syntax, runtime errors) 170 | - Correctness verification (result matching) 171 | - Heuristic penalties (ordering, limits, aliases) 172 | 173 | **GEPA Optimizer**: Iteratively improves prompts through reflection 174 | 175 | ### Key Technologies 176 | 177 | - [DSPy](https://github.com/stanfordnlp/dspy): Framework for programming language models 178 | - [GEPA](https://dspy.ai/api/optimizers/GEPA/overview/): Generalized Evolution of Prompting via Adaptation 179 | - SQLite: Lightweight database for validation 180 | - OpenAI GPT-4o/GPT-4o-mini: Language models for generation and reflection 181 | 182 | ## Advanced Usage 183 | 184 | ### Custom Databases 185 | 186 | Replace the in-memory SQLite database with your own: 187 | 188 | ```python 189 | def setup_db() -> sqlite3.Connection: 190 | conn = sqlite3.connect("your_database.db") 191 | return conn 192 | ``` 193 | 194 | ### Different LLM Providers 195 | 196 | Use other DSPy-supported models: 197 | 198 | ```python 199 | # Anthropic Claude 200 | student_lm = dspy.LM("anthropic/claude-3-5-sonnet-20241022") 201 | 202 | # Local models 203 | student_lm = dspy.LM("ollama/llama3") 204 | ``` 205 | 206 | ### Metric Customization 207 | 208 | Adjust scoring weights in `sql_metric()`: 209 | 210 | ```python 211 | # Current weights: 212 | # +0.4 for safety 213 | # +0.3 for execution 214 | # +0.3 for correctness 215 | # -0.05 for each heuristic violation 216 | ``` 217 | 218 | ## Limitations 219 | 220 | - Requires gold SQL examples for correctness validation (can work without, but with reduced signal) 221 | - Limited to SELECT queries (DML/DDL operations are blocked) 222 | - Performance depends on LLM capabilities and schema complexity 223 | - English language queries by default 224 | 225 | ## Contributing 226 | 227 | Contributions are welcome! Please feel free to submit issues or pull requests. 228 | 229 | ## License 230 | 231 | MIT License - see [LICENSE](LICENSE) for details. 232 | 233 | ## Acknowledgments 234 | 235 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 236 | - Uses the [GEPA optimizer](https://dspy.ai/api/optimizers/GEPA/overview/) for automatic prompt improvement 237 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Multi-Agent Research Pipeline 2 | 3 | An intelligent, multi-agent research pipeline that autonomously conducts web research and generates comprehensive, citation-backed reports using DSPy, LangGraph, and the Exa search API. 
4 | 5 | ## Features 6 | 7 | - **Multi-Agent Architecture**: Coordinated agents for query planning, search, summarization, writing, and review using LangGraph 8 | - **Smart Web Research**: Powered by Exa API for high-quality web search with full-text retrieval 9 | 10 | - **Automated Writing**: Generates polished Markdown reports with proper citations and references 11 | - **Prompt Optimization**: Optional GEPA (Generalized Evolution of Prompting via Adaptation) for automatic prompt tuning 12 | - **Iterative Research**: Gap analysis determines when additional research is needed 13 | - **Quality Assurance**: Built-in review and revision cycle for report quality 14 | - **Structured Citations**: Global citation management with numbered references 15 | 16 | ## Architecture 17 | 18 | The pipeline uses a graph-based workflow with the following stages: 19 | 20 | 1. **Query Planning** → Generate diverse search queries with operators (site:, intitle:, quoted phrases) 21 | 2. **Parallel Search** → Execute queries via Exa API and extract full-text content 22 | 3. **Summarization** → Convert sources into cited evidence bullets 23 | 4. **Gap Analysis** → Determine if more research is needed (up to 2 rounds) 24 | 5. **Section Writing** → Draft sections with proper citations in parallel 25 | 6. **Assembly & Review** → Combine sections, renumber citations globally, review quality 26 | 7. **Revision** → Apply suggestions if quality checks fail 27 | 28 | ### Technology Stack 29 | 30 | - **[DSPy](https://github.com/stanfordnlp/dspy)**: Framework for programming language models with signatures 31 | - **[LangGraph](https://github.com/langchain-ai/langgraph)**: Orchestration for multi-agent workflows with parallelism 32 | - **[Exa API](https://exa.ai/)**: Neural search engine for web research 33 | - **Google Gemini**: LLM backend (2.5 Flash for research, 2.5 Pro for writing) 34 | - **GEPA**: Efficient prompt optimization using reflection 35 | 36 | ## Requirements 37 | 38 | - Python 3.13+ 39 | - API Keys: 40 | - Google Gemini API key 41 | - Exa API key (get from [dashboard.exa.ai](https://dashboard.exa.ai)) 42 | 43 | ## Installation 44 | 45 | ```bash 46 | # Clone the repository 47 | cd dspy-gepa-researcher 48 | 49 | # Install dependencies using uv (recommended) 50 | uv add dspy langgraph exa-py python-dateutil pydantic 51 | 52 | # Or with pip 53 | pip install dspy langgraph exa-py python-dateutil pydantic 54 | ``` 55 | 56 | ## Configuration 57 | 58 | Set the required environment variables: 59 | 60 | ```bash 61 | export GEMINI_API_KEY="your-gemini-api-key" 62 | export EXA_API_KEY="your-exa-api-key" 63 | 64 | # Optional configuration 65 | export GEMINI_WRITER_MODEL="gemini/gemini-flash-latest" # Default 66 | export GEMINI_RESEARCH_MODEL="gemini/gemini-flash-latest" # Default 67 | export RR_MAX_ROUNDS="2" # Research rounds 68 | export RR_SEARCH_K="6" # Results per query 69 | export RR_MAX_CHARS="12000" # Max chars per source 70 | ``` 71 | 72 | ## Usage 73 | 74 | ### Basic Usage 75 | 76 | ```python 77 | import asyncio 78 | from dspy_gepa_researcher import run_pipeline, SectionSpec 79 | 80 | # Define your research sections 81 | sections = [ 82 | SectionSpec( 83 | name="Executive Summary", 84 | instructions="In 180-250 words, summarize the most decision-relevant takeaways." 85 | ), 86 | SectionSpec( 87 | name="Market Landscape", 88 | instructions="Define the space; 2023-2025 trends; include 4+ specific figures with sources." 
89 | ), 90 | SectionSpec( 91 | name="Key Players & Differentiation", 92 | instructions="Compare 5-7 players; list 1-2 distinctive capabilities each." 93 | ), 94 | ] 95 | 96 | # Run the pipeline 97 | topic = "State of Edge AI Acceleration (2024-2025)" 98 | result = asyncio.run(run_pipeline( 99 | topic=topic, 100 | sections=sections, 101 | optimization=False # Set to True to enable GEPA optimization 102 | )) 103 | 104 | # Report is saved to ./report.md 105 | ``` 106 | 107 | ### Running the Example 108 | 109 | ```bash 110 | uv run dspy_gepa_researcher.py 111 | ``` 112 | 113 | This will generate a report on "State of Edge AI Acceleration (2024-2025)" with 5 sections. 114 | 115 | ## How It Works 116 | 117 | ### 1. Query Generation 118 | The `QUERY_GEN` agent creates 4-8 diverse search queries per section using: 119 | - Quoted phrases for exact matches 120 | - `site:` operators to target specific domains 121 | - `intitle:` to find relevant titles 122 | - Date ranges for time-specific research 123 | 124 | ### 2. Parallel Search & Summarization 125 | Each query is executed in parallel: 126 | - Exa API retrieves top-k documents with full text 127 | - `SUMMARIZER` agent extracts key facts with `[S#]` citations 128 | - Results are aggregated per section 129 | 130 | ### 3. Gap Analysis 131 | The `GAP_ANALYZER` agent reviews evidence for each section: 132 | - Determines if coverage is sufficient 133 | - Generates follow-up queries if needed 134 | - Proceeds to writing after max rounds or sufficient coverage 135 | 136 | ### 4. Section Writing 137 | Sections are written in parallel: 138 | - `WRITE_SECTION` drafts markdown with `[n]` citations 139 | - `CITE_FIXER` ensures proper citation format 140 | - Character count and citation stats are tracked 141 | 142 | ### 5. Assembly & Review 143 | - Citations are renumbered globally across all sections 144 | - `REVIEWER` agent checks for quality issues 145 | - `REVISER` agent applies suggestions if needed 146 | - Final report is saved with References section 147 | 148 | ## GEPA Optimization 149 | 150 | The pipeline supports automatic prompt optimization using GEPA (Generalized Evolution of Prompting via Adaptation). When enabled, it optimizes prompts for all 7 agents: 151 | 152 | 1. `QUERY_GEN` - Search query generation 153 | 2. `SUMMARIZER` - Source summarization 154 | 3. `WRITE_SECTION` - Section writing 155 | 4. `GAP_ANALYZER` - Coverage analysis 156 | 5. `CITE_FIXER` - Citation formatting 157 | 6. `REVIEWER` - Quality review 158 | 7. `REVISER` - Report revision 159 | 160 | Each agent is trained on 2-3 high-quality examples specific to its task. 161 | 162 | To enable optimization: 163 | ```python 164 | result = asyncio.run(run_pipeline(topic, sections, optimization=True)) 165 | ``` 166 | 167 | ## Output 168 | 169 | The pipeline generates: 170 | 171 | 1. **report.md**: Complete markdown report with: 172 | - All sections with proper headings 173 | - Inline numeric citations `[1]`, `[2]`, etc. 174 | - References section with full metadata 175 | - Source URLs, publication dates, and domains 176 | 177 | 2. **Console Logs**: Detailed progress tracking: 178 | ``` 179 | [QUERY] Planning search queries for 5 sections... 180 | [SEARCH] 'Market Size': "market size TAM 2024"... 181 | [SEARCH] → Found 6 documents 182 | [SEARCH] → Extracted 4 evidence bullets 183 | [GAP] Analyzing research coverage (Round 1/2)... 184 | [WRITE] Drafting section: 'Market Size' 185 | [WRITE] → 2341 chars, 8 citations 186 | [REVIEW] → Pass: true, Issues: 0, Suggestions: 2 187 | ``` 188 | 189 | 3. 
**Evaluation Metrics**: 190 | - Overall quality score 191 | - Citation count 192 | - Character count 193 | - Structure checks 194 | 195 | ## Project Structure 196 | 197 | ``` 198 | dspy-gepa-researcher/ 199 | dspy_gepa_researcher.py # Main pipeline implementation 200 | README.md # This file 201 | report.md # Generated output (after running) 202 | LICENSE # MIT License 203 | ``` 204 | 205 | ## Customization 206 | 207 | ### Adding New Sections 208 | 209 | ```python 210 | sections.append(SectionSpec( 211 | name="Technology Stack", 212 | instructions="List 5+ technologies with adoption rates, version info, and benchmarks" 213 | )) 214 | ``` 215 | 216 | ### Adjusting Research Depth 217 | 218 | ```python 219 | # More queries per section (default: 6) 220 | export RR_SEARCH_K="10" 221 | 222 | # More research rounds (default: 2) 223 | export RR_MAX_ROUNDS="3" 224 | ``` 225 | 226 | ### Changing Models 227 | 228 | ```python 229 | export GEMINI_WRITER_MODEL="gemini/gemini-2.5-pro-preview-03-25" 230 | export GEMINI_RESEARCH_MODEL="gemini/gemini-2.5-flash-preview-03-25" 231 | ``` 232 | 233 | ## Limitations 234 | 235 | - Requires active API keys (Gemini and Exa) 236 | - Token costs scale with number of sections and research rounds 237 | - Gemini models have output limits (8,192 tokens max per response) 238 | - Quality depends on Exa search result quality 239 | 240 | ## Contributing 241 | 242 | Contributions are welcome! Please feel free to submit a Pull Request. 243 | 244 | ## License 245 | 246 | MIT License - see [LICENSE](LICENSE) for details. 247 | 248 | ## Acknowledgments 249 | 250 | - Built with [DSPy](https://github.com/stanfordnlp/dspy) by Stanford NLP 251 | - Powered by [Exa](https://exa.ai/) neural search 252 | - Orchestrated with [LangGraph](https://github.com/langchain-ai/langgraph) 253 | 254 | ## Citation 255 | 256 | If you use this project in your research, please cite: 257 | 258 | ```bibtex 259 | @software{dspy_examples, 260 | title={DSPy Multi-Agent Research Pipeline}, 261 | author={Your Name}, 262 | year={2025}, 263 | url={https://github.com/raja-patnaik/dspy-examples} 264 | } 265 | ``` 266 | -------------------------------------------------------------------------------- /dspy-gepa-sql-generator/nl2sql_gepa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sqlite3 4 | import inspect 5 | from typing import List, Tuple, Optional 6 | 7 | import dspy 8 | 9 | # ----------------------------------------------------------------------------- 10 | # 0) Model configuration 11 | # ----------------------------------------------------------------------------- 12 | STUDENT_MODEL = os.getenv("DSPY_STUDENT_MODEL", "openai/gpt-4o-mini") 13 | REFLECT_MODEL = os.getenv("DSPY_REFLECT_MODEL", "openai/gpt-4o") 14 | 15 | student_lm = dspy.LM(STUDENT_MODEL, temperature=0.2, max_tokens=800) 16 | reflection_lm = dspy.LM(REFLECT_MODEL, temperature=0.8, max_tokens=2000) 17 | dspy.configure(lm=student_lm) 18 | 19 | # GEPA import (newer/older DSPy layouts) 20 | try: 21 | from dspy import GEPA 22 | except Exception: 23 | try: 24 | from dspy.teleprompt import GEPA # older path 25 | except Exception: 26 | GEPA = None 27 | 28 | 29 | # ----------------------------------------------------------------------------- 30 | # 1) Tiny SQLite database and schema description 31 | # ----------------------------------------------------------------------------- 32 | def setup_db() -> sqlite3.Connection: 33 | conn = sqlite3.connect(":memory:") 34 | c = 
conn.cursor() 35 | 36 | c.executescript(""" 37 | CREATE TABLE authors ( 38 | id INTEGER PRIMARY KEY, 39 | name TEXT NOT NULL, 40 | country TEXT NOT NULL 41 | ); 42 | CREATE TABLE books ( 43 | id INTEGER PRIMARY KEY, 44 | title TEXT NOT NULL, 45 | year INTEGER NOT NULL, 46 | author_id INTEGER NOT NULL, 47 | genre TEXT NOT NULL, 48 | pages INTEGER NOT NULL, 49 | price REAL NOT NULL, 50 | FOREIGN KEY(author_id) REFERENCES authors(id) 51 | ); 52 | CREATE TABLE sales ( 53 | book_id INTEGER NOT NULL, 54 | year INTEGER NOT NULL, 55 | sold INTEGER NOT NULL, 56 | FOREIGN KEY(book_id) REFERENCES books(id) 57 | ); 58 | """) 59 | 60 | authors = [ 61 | (1, "Margaret Atwood", "Canada"), 62 | (2, "Haruki Murakami", "Japan"), 63 | (3, "Chimamanda Ngozi Adichie", "Nigeria"), 64 | (4, "Neil Gaiman", "UK"), 65 | (5, "Alice Munro", "Canada"), 66 | ] 67 | c.executemany("INSERT INTO authors VALUES (?, ?, ?)", authors) 68 | 69 | books = [ 70 | (1, "The Handmaid's Tale", 1985, 1, "Dystopia", 311, 9.99), 71 | (2, "Kafka on the Shore", 2002, 2, "Magical Realism", 505, 14.99), 72 | (3, "American Gods", 2001, 4, "Fantasy", 465, 12.99), 73 | (4, "Half of a Yellow Sun", 2006, 3, "Historical", 448, 13.99), 74 | (5, "The Testaments", 2019, 1, "Dystopia", 419, 15.99), 75 | (6, "Norwegian Wood", 1987, 2, "Romance", 296, 10.99), 76 | (7, "Dear Life", 2012, 5, "Short Stories", 336, 11.99), 77 | (8, "Neverwhere", 1996, 4, "Fantasy", 370, 9.49), 78 | (9, "Oryx and Crake", 2003, 1, "Dystopia", 389, 11.49), 79 | ] 80 | c.executemany("INSERT INTO books VALUES (?, ?, ?, ?, ?, ?, ?)", books) 81 | 82 | sales = [ 83 | (1, 2024, 12000), 84 | (2, 2024, 15000), 85 | (3, 2024, 16000), 86 | (4, 2024, 11000), 87 | (5, 2024, 9000), 88 | (6, 2024, 13000), 89 | (7, 2024, 7000), 90 | (8, 2024, 8000), 91 | (9, 2024, 10000), 92 | ] 93 | c.executemany("INSERT INTO sales VALUES (?, ?, ?)", sales) 94 | 95 | conn.commit() 96 | return conn 97 | 98 | 99 | def describe_schema(conn: sqlite3.Connection, sample_rows: int = 2) -> str: 100 | """Create a compact, LM-friendly schema string with a couple of sample rows per table.""" 101 | c = conn.cursor() 102 | parts = [] 103 | for table in ["authors", "books", "sales"]: 104 | c.execute(f"PRAGMA table_info({table})") 105 | cols = [f"{row[1]}:{row[2]}" for row in c.fetchall()] 106 | parts.append(f"TABLE {table}({', '.join(cols)})") 107 | c.execute(f"SELECT * FROM {table} LIMIT {sample_rows}") 108 | rows = c.fetchall() 109 | parts.append(f"EXAMPLE_ROWS {table}: {rows}") 110 | return "\n".join(parts) 111 | 112 | 113 | # ----------------------------------------------------------------------------- 114 | # 2) DSPy program (Signature + Module) 115 | # ----------------------------------------------------------------------------- 116 | class NL2SQL(dspy.Signature): 117 | """Generate a single safe SQLite SELECT to answer the question from the given schema.""" 118 | 119 | schema = dspy.InputField(desc="SQLite schema and 1–2 sample rows per table") 120 | question = dspy.InputField(desc="Natural-language question about the data") 121 | sql = dspy.OutputField( 122 | desc=( 123 | "Only ONE statement. Start with SELECT or WITH. " 124 | "Use exact column/table names. " 125 | "Return only the SQL (no comments)." 
126 | ) 127 | ) 128 | 129 | 130 | class NL2SQLProgram(dspy.Module): 131 | def __init__(self): 132 | super().__init__() 133 | self.generate = dspy.ChainOfThought( 134 | NL2SQL 135 | ) # hidden reasoning; SQL only as output 136 | 137 | def forward(self, schema: str, question: str): 138 | return self.generate(schema=schema, question=question) 139 | 140 | 141 | # ----------------------------------------------------------------------------- 142 | # 3) Questions + (optional) gold SQL 143 | # ----------------------------------------------------------------------------- 144 | def questions_and_gold_sql() -> List[Tuple[str, Optional[str]]]: 145 | base: List[Tuple[str, Optional[str]]] = [ 146 | ( 147 | "List the titles of books written by authors from Canada, alphabetically.", 148 | "SELECT b.title FROM books b JOIN authors a ON a.id=b.author_id " 149 | "WHERE a.country='Canada' ORDER BY b.title;", 150 | ), 151 | ( 152 | "Which author sold the most copies in 2024?", 153 | "SELECT a.name FROM authors a " 154 | "JOIN books b ON a.id=b.author_id " 155 | "JOIN sales s ON s.book_id=b.id " 156 | "WHERE s.year=2024 " 157 | "GROUP BY a.name ORDER BY SUM(s.sold) DESC LIMIT 1;", 158 | ), 159 | ( 160 | "How many books per genre? Return genre and count, count descending.", 161 | "SELECT genre, COUNT(*) AS n FROM books GROUP BY genre ORDER BY n DESC;", 162 | ), 163 | ( 164 | "What are the top 2 longest books by page count? Return title and pages.", 165 | "SELECT title, pages FROM books ORDER BY pages DESC LIMIT 2;", 166 | ), 167 | ( 168 | "Average price of books published in or after 2010. Return a single number.", 169 | "SELECT ROUND(AVG(price), 2) AS avg_price FROM books WHERE year >= 2010;", 170 | ), 171 | ( 172 | "List distinct countries represented by authors, alphabetically.", 173 | "SELECT DISTINCT country FROM authors ORDER BY country;", 174 | ), 175 | ( 176 | "For Haruki Murakami, what is the average pages of his books? 
Return name and avg pages.", 177 | "SELECT a.name, ROUND(AVG(b.pages), 1) AS avg_pages " 178 | "FROM authors a JOIN books b ON a.id=b.author_id " 179 | "WHERE a.name='Haruki Murakami' GROUP BY a.name;", 180 | ), 181 | ( 182 | "Find the cheapest Fantasy book (title + price).", 183 | "SELECT title, price FROM books WHERE genre='Fantasy' ORDER BY price ASC LIMIT 1;", 184 | ), 185 | ( 186 | "Return titles that contain the word 'the' (case-insensitive), alphabetically.", 187 | "SELECT title FROM books WHERE LOWER(title) LIKE '%the%' ORDER BY title;", 188 | ), 189 | ( 190 | "How many books did Margaret Atwood publish after 2000?", 191 | "SELECT COUNT(*) AS n FROM books b " 192 | "JOIN authors a ON a.id=b.author_id " 193 | "WHERE a.name='Margaret Atwood' AND year > 2000;", 194 | ), 195 | ] 196 | 197 | # Harder variations to create headroom (ordering, aliasing, shape, top-k) 198 | extras: List[Tuple[str, Optional[str]]] = [ 199 | ( 200 | "Return the top 3 authors by total copies sold in 2024 (name + total_sold), descending.", 201 | "SELECT a.name, SUM(s.sold) AS total_sold " 202 | "FROM authors a " 203 | "JOIN books b ON b.author_id = a.id " 204 | "JOIN sales s ON s.book_id = b.id " 205 | "WHERE s.year = 2024 " 206 | "GROUP BY a.name " 207 | "ORDER BY total_sold DESC " 208 | "LIMIT 3;", 209 | ), 210 | ( 211 | "List titles containing the word 'the' (case-insensitive), alphabetically, return them as column lower_title.", 212 | "SELECT LOWER(title) AS lower_title " 213 | "FROM books " 214 | "WHERE LOWER(title) LIKE '%the%' " 215 | "ORDER BY lower_title ASC;", 216 | ), 217 | ( 218 | "For each country, return country and number of authors as n_authors; sort by n_authors desc then country asc.", 219 | "SELECT country, COUNT(*) AS n_authors " 220 | "FROM authors " 221 | "GROUP BY country " 222 | "ORDER BY n_authors DESC, country ASC;", 223 | ), 224 | ( 225 | "What percent of books are Dystopia? 
Return a single number named pct_dystopia (1 decimal place).", 226 | "SELECT ROUND(100.0 * SUM(CASE WHEN genre='Dystopia' THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct_dystopia " 227 | "FROM books;", 228 | ), 229 | ] 230 | 231 | return base + extras 232 | 233 | 234 | def run_sql(conn: sqlite3.Connection, sql: str): 235 | cur = conn.cursor() 236 | cur.execute(sql) 237 | rows = cur.fetchall() 238 | cols = [d[0] for d in cur.description] if cur.description else [] 239 | return cols, rows 240 | 241 | 242 | def build_examples(conn: sqlite3.Connection, schema_text: str): 243 | """Build DSPy examples and precompute gold results when available.""" 244 | examples = [] 245 | for q, gold_sql in questions_and_gold_sql(): 246 | expected = None 247 | if gold_sql: 248 | cols, rows = run_sql(conn, gold_sql) 249 | ordered = "ORDER BY" in gold_sql.upper() 250 | expected = {"columns": cols, "rows": rows, "ordered": ordered} 251 | ex = dspy.Example( 252 | schema=schema_text, question=q, expected=expected 253 | ).with_inputs("schema", "question") 254 | examples.append(ex) 255 | return examples 256 | 257 | 258 | # ----------------------------------------------------------------------------- 259 | # 4) Metric: safety + execution + strict(er) correctness + heuristic penalties 260 | # ----------------------------------------------------------------------------- 261 | FORBIDDEN = re.compile( 262 | r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|PRAGMA|ATTACH|DETACH|CREATE|REPLACE|VACUUM|TRIGGER|INDEX|VIEW)\b", 263 | flags=re.IGNORECASE, 264 | ) 265 | 266 | 267 | def _clean_sql(text: str) -> str: 268 | t = (text or "").strip() 269 | t = re.sub(r"^```(?:sql)?", "", t, flags=re.IGNORECASE).strip() 270 | t = re.sub(r"```$", "", t).strip() 271 | # Keep only the first statement; terminate neatly if a semicolon was present 272 | if ";" in t: 273 | t = t.split(";")[0].strip() + ";" 274 | return t 275 | 276 | 277 | def sql_metric( 278 | gold: dspy.Example, 279 | pred: dspy.Prediction, 280 | trace=None, 281 | pred_name=None, 282 | pred_trace=None, 283 | ): 284 | """ 285 | GEPA-friendly metric: returns {'score': float, 'feedback': str}. 286 | 287 | Scoring: 288 | +0.4 single safe SELECT/WITH, no forbidden tokens 289 | +0.3 executes without error 290 | +0.3 exact result match (rows+columns); if not exact but sets equal → +0.15 291 | Heuristic penalties (applied after success): -0.05 each for missing ORDER BY when asked, 292 | wrong direction, missing LIMIT k when asked, missing DISTINCT when asked, wrong output shape for "single number". 
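    Worked example (illustrative): a safe SELECT that executes but returns the right
    rows in the wrong order because it omits the ORDER BY requested by an
    "alphabetically" question scores 0.4 + 0.3 + 0.15 (set-equal rows) - 0.05
    (missing ORDER BY) = 0.80.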
293 | """ 294 | sql_raw = getattr(pred, "sql", "") or "" 295 | sql = _clean_sql(sql_raw) 296 | question = getattr(gold, "question", "") 297 | ql = question.lower() 298 | 299 | score = 0.0 300 | fb = [] 301 | 302 | # 1) Safety / format 303 | if not (sql.lower().startswith("select") or sql.lower().startswith("with")): 304 | fb.append("SQL must start with SELECT or WITH.") 305 | elif FORBIDDEN.search(sql): 306 | fb.append("Forbidden tokens present (DDL/DML/PRAGMA/etc.).") 307 | else: 308 | score += 0.4 309 | 310 | # 2) Execution 311 | exec_cols, exec_rows, exec_err = [], [], None 312 | if score >= 0.4: 313 | try: 314 | conn = ( 315 | setup_db() 316 | ) # fresh DB prevents accidental writes (even though we forbid DDL/DML) 317 | cur = conn.cursor() 318 | cur.execute(sql) 319 | exec_rows = cur.fetchall() 320 | exec_cols = [d[0] for d in cur.description] if cur.description else [] 321 | score += 0.3 322 | except Exception as e: 323 | exec_err = str(e) 324 | fb.append(f"Execution error: {exec_err}") 325 | 326 | # 2.1) Heuristic penalties open headroom for GEPA 327 | penalty = 0.0 328 | if exec_err is None: 329 | su = sql.upper() 330 | 331 | asks_order = any( 332 | w in ql 333 | for w in [ 334 | "alphabet", 335 | "ascending", 336 | "descending", 337 | "top", 338 | "highest", 339 | "lowest", 340 | "most", 341 | ] 342 | ) 343 | if asks_order and "ORDER BY" not in su: 344 | penalty += 0.05 345 | fb.append("Question asks for ordering; add an ORDER BY.") 346 | 347 | if any(w in ql for w in ["descending", "highest", "top", "most", "largest"]): 348 | if "ORDER BY" in su and "DESC" not in su: 349 | penalty += 0.05 350 | fb.append("Use ORDER BY ... DESC for descending/top/most queries.") 351 | 352 | if "alphabet" in ql or "ascending" in ql: 353 | if "ORDER BY" in su and "DESC" in su: 354 | penalty += 0.05 355 | fb.append("Use ORDER BY ... ASC for alphabetical/ascending queries.") 356 | 357 | m = re.search(r"\btop\s+(\d+)\b", ql) 358 | if m: 359 | k = int(m.group(1)) 360 | if f"LIMIT {k}" not in su: 361 | penalty += 0.05 362 | fb.append(f"Add LIMIT {k} as requested (top {k}).") 363 | 364 | if "distinct" in ql and "DISTINCT" not in su: 365 | penalty += 0.05 366 | fb.append("Add DISTINCT as requested.") 367 | 368 | if "single number" in ql: 369 | if not (len(exec_rows) == 1 and len(exec_cols) == 1): 370 | penalty += 0.05 371 | fb.append( 372 | "Return exactly one column and one row for 'single number' requests." 373 | ) 374 | 375 | # 3) Correctness vs. gold (if provided) 376 | s_correct = 0.0 377 | gold_expected = getattr(gold, "expected", None) 378 | if exec_err is None and gold_expected: 379 | gold_rows = gold_expected["rows"] 380 | gold_cols = gold_expected["columns"] 381 | require_order = bool(gold_expected.get("ordered", False)) 382 | 383 | same_cols = exec_cols == gold_cols 384 | 385 | if require_order: 386 | same_rows = exec_rows == gold_rows 387 | else: 388 | same_rows = sorted(map(tuple, exec_rows)) == sorted(map(tuple, gold_rows)) 389 | 390 | if same_cols and same_rows: 391 | s_correct = 0.3 392 | else: 393 | # partial credit: set-equality on rows and columns 394 | set_rows_equal = sorted(map(tuple, exec_rows)) == sorted( 395 | map(tuple, gold_rows) 396 | ) 397 | set_cols_equal = set(exec_cols) == set(gold_cols) 398 | if set_rows_equal and set_cols_equal: 399 | s_correct = 0.15 400 | if not same_cols: 401 | fb.append( 402 | "Column order/aliases differ; add explicit aliases to match expected columns." 
403 | ) 404 | if require_order and not same_rows: 405 | fb.append("Row order differs; add the correct ORDER BY.") 406 | else: 407 | fb.append( 408 | f"Result mismatch. Expected (sample): {gold_rows[:3]} | Got (sample): {exec_rows[:3]}" 409 | ) 410 | 411 | elif exec_err is None and not gold_expected: 412 | # Unlabeled partial credit to keep signal flowing 413 | if len(exec_rows) > 0: 414 | s_correct = 0.15 415 | fb.append( 416 | "No gold available; granting partial credit for non-empty result." 417 | ) 418 | else: 419 | fb.append("Query returned 0 rows; consider joins/filters.") 420 | 421 | score += s_correct 422 | score = max(0.0, min(1.0, score - penalty)) # apply penalties, clamp to [0, 1] 423 | 424 | if score == 1.0: 425 | fb.append("Perfect score. Keep current strategy.") 426 | elif score < 0.4: 427 | fb.append("Rewrite as ONE safe SELECT/WITH; avoid DDL/DML/PRAGMA.") 428 | 429 | return {"score": float(score), "feedback": "\n".join(fb)} 430 | 431 | 432 | def sql_metric_scalar(gold, pred, trace=None, pred_name=None, pred_trace=None): 433 | """Numeric-only wrapper compatible with GEPA's 5-arg metric signature.""" 434 | return float( 435 | sql_metric( 436 | gold, 437 | pred, 438 | trace=trace, 439 | pred_name=pred_name, 440 | pred_trace=pred_trace, 441 | )["score"] 442 | ) 443 | 444 | 445 | def sql_metric_dual(gold, pred, trace=None, pred_name=None, pred_trace=None): 446 | """ 447 | Dual-mode metric for older GEPA builds: 448 | - Evaluation path (no trace): return float score 449 | - Reflection path (has trace/pred_name/pred_trace): return {'score', 'feedback'} 450 | """ 451 | res = sql_metric( 452 | gold, pred, trace=trace, pred_name=pred_name, pred_trace=pred_trace 453 | ) 454 | # If called by Evaluate (no reflection context), return float only: 455 | if (trace is None) and (pred_name is None) and (pred_trace is None): 456 | return float(res["score"]) 457 | # If called by GEPA reflection, return rich feedback: 458 | return res 459 | 460 | 461 | # ----------------------------------------------------------------------------- 462 | # 5) Main — build data, baseline eval, GEPA optimization, post eval 463 | # ----------------------------------------------------------------------------- 464 | def main(): 465 | if GEPA is None: 466 | raise RuntimeError( 467 | "Could not import GEPA from DSPy. Please upgrade/install a DSPy version that includes GEPA." 468 | ) 469 | 470 | # Build schema and examples 471 | conn = setup_db() 472 | schema_text = describe_schema(conn) 473 | all_examples = build_examples(conn, schema_text) 474 | 475 | # Train/dev split (tweak to taste) 476 | trainset = all_examples[:8] 477 | devset = all_examples[8:] 478 | 479 | # Baseline program 480 | program = NL2SQLProgram() 481 | 482 | # Baseline detailed feedback on devset 483 | print("== Baseline on devset ==") 484 | baseline_scores = [] 485 | for ex in devset: 486 | pred = program(schema=ex.schema, question=ex.question) 487 | res = sql_metric(ex, pred) 488 | baseline_scores.append(res["score"]) 489 | print("\nQ:", ex.question) 490 | print("SQL:\n", _clean_sql(getattr(pred, "sql", ""))) 491 | print(f"Score: {res['score']:.3f}") 492 | print("Feedback:\n" + res["feedback"]) 493 | print(f"\nBaseline mean score: {sum(baseline_scores) / len(baseline_scores):.3f}") 494 | 495 | # GEPA optimizer (reflective prompt evolution) 496 | # --- Compatibility wiring: metric must be numeric; feedback wired if supported. 
497 | gepa_kwargs = dict( 498 | metric=sql_metric_scalar, # <-- numeric metric (prevents the dict summation crash) 499 | auto="medium", 500 | reflection_lm=reflection_lm, 501 | track_stats=True, 502 | add_format_failure_as_feedback=True, 503 | ) 504 | 505 | init_sig = inspect.signature(GEPA.__init__).parameters 506 | if "feedback_metric" in init_sig: 507 | gepa_kwargs["feedback_metric"] = sql_metric 508 | elif "feedback_producer" in init_sig: # some older builds 509 | gepa_kwargs["feedback_producer"] = sql_metric 510 | else: 511 | print( 512 | "⚠️ GEPA build does not expose 'feedback_metric'/'feedback_producer'. " 513 | "Optimization will use the scalar metric only (still works, less rich guidance)." 514 | ) 515 | 516 | gepa = GEPA(**gepa_kwargs) 517 | 518 | optimized_program = gepa.compile(program, trainset=trainset, valset=devset) 519 | 520 | # Post‑GEPA detailed feedback on devset 521 | print("\n== Post‑GEPA on devset ==") 522 | post_scores = [] 523 | for ex in devset: 524 | pred = optimized_program(schema=ex.schema, question=ex.question) 525 | res = sql_metric(ex, pred) 526 | post_scores.append(res["score"]) 527 | print("\nQ:", ex.question) 528 | print("SQL:\n", _clean_sql(getattr(pred, "sql", ""))) 529 | print(f"Score: {res['score']:.3f}") 530 | print("Feedback:\n" + res["feedback"]) 531 | print(f"\nPost‑GEPA mean score: {sum(post_scores) / len(post_scores):.3f}") 532 | 533 | # Quick before/after on a single sample 534 | sample = devset[0] 535 | before = program(schema=sample.schema, question=sample.question) 536 | after = optimized_program(schema=sample.schema, question=sample.question) 537 | print("\n== Before/After example ==") 538 | print("Q:", sample.question) 539 | print("Before SQL:\n", _clean_sql(getattr(before, "sql", ""))) 540 | print("After SQL:\n", _clean_sql(getattr(after, "sql", ""))) 541 | 542 | 543 | if __name__ == "__main__": 544 | main() 545 | -------------------------------------------------------------------------------- /dspy-gepa-researcher/report.md: -------------------------------------------------------------------------------- 1 | # Executive Summary 2 | 3 | The global economic and policy landscape is undergoing a fundamental transformation, moving away from the limitations of the Washington Consensus toward a new framework—the London Consensus—that prioritizes social factors, institutional resilience, and national happiness as core measures of success [1, 2, 3, 6]. 4 | 5 | Technological disruption is accelerating this change. The rapid adoption of AI and advanced computing is driving intense global competition for technological leadership [1]. This innovation underpins the exponential growth of Industry 5.0, a market valued at over $51.5 billion in 2023, which emphasizes human-centric, sustainable, and resilient industrial production. Policymakers must recognize the projected $29 trillion to $48 trillion revenue potential of emerging industry arenas by 2040 [2]. 6 | 7 | Simultaneously, geopolitical uncertainty poses significant risks, particularly within the global energy system, despite the ongoing transition driven by clean technologies like solar and wind [19, 20, 21, 22]. Addressing these fragilities requires massive capital deployment; an estimated $106 trillion in cumulative investment is needed through 2040 for global infrastructure alone [3]. 
Decision-makers must align strategic planning with the urgent climate imperative identified by the IPCC [4], focusing on transformative, equitable action while navigating internal policy divides and adapting investment strategies to manage geopolitical volatility and capitalize on AI-driven private markets. 8 | 9 | # Market Landscape 10 | 11 | The global technology market landscape is defined by rapid innovation, intense geopolitical competition, and a strategic shift toward efficiency-driven digital transformation [3, 4, 14]. Following macroeconomic challenges in 2022 and 2023, the sector is poised for robust growth, driven by enterprise investment in automation and strategic technologies [16, 4, 20]. 12 | 13 | ## 2023–2025 Trends and Growth Drivers 14 | 15 | The market space is dominated by software and IT services, which are expected to capture up to **70% of global tech spending** by 2027/2029 [3, 4, 5]. Overall worldwide IT spending was projected at $4.7 trillion in 2023 and is forecasted to grow 8% to reach **$5.1 trillion in 2024** [16, 4]. Forecasts for 2025 range up to $5.8 trillion [5]. 16 | 17 | The primary drivers of this growth are: 18 | 19 | 1. **Generative AI (GenAI) and Automation:** GenAI experienced explosive growth in 2023, with one-third of surveyed organizations using it regularly [6]. While initial adoption focused on proof-of-concept, its material impact on IT budgets is expected to begin in 2025 [7]. Global AI spending is projected to reach **$407 billion in 2025**, representing a 28.6% increase [5]. This demand is fueling the AI accelerators market, which is projected to expand at a robust **CAGR of 44.1%** between 2024 and 2029 [8]. 20 | 2. **Cloud Computing and Infrastructure:** Cloud spending remains a core growth engine. The cloud computing industry is forecast to grow to **$678 billion in 2025** [5]. Furthermore, the Data Center Systems segment is set to grow by 15.5% in 2025, driven by the need for increased computing power [9]. 21 | 3. **Cybersecurity and Modernization:** Organizations are prioritizing cybersecurity and the modernization of legacy systems to improve operational efficiency and address persistent IT talent shortages [1, 3, 5]. 22 | 23 | ## Macroeconomic Context 24 | 25 | While the technology outlook is optimistic, the global business landscape faces continued uncertainty. The 2025 outlook is characterized by both promise (potential soft landings and descending interest rates) and peril (intensifying geopolitical conflicts and trade tensions) [9, 11]. Geopolitical factors are increasingly shaping production choices, particularly in strategic areas like semiconductors and sovereign AI advancements [9, 31]. Investment opportunities are anticipated to be strongest in technology, renewable energy, and healthcare [10]. 26 | 27 | # Key Players & Differentiation 28 | 29 | The generative AI market is characterized by a crowded vendor landscape, broadly categorized into foundation model providers, infrastructure providers, and application developers [11]. Differentiation among foundation model providers is achieved through performance, safety focus, open-source strategy, and integration capabilities [8, 35]. 
30 | 31 | The following table compares six leading foundation model developers based on their core offerings and strategic focus: 32 | 33 | | Player | Key Models | Distinctive Capabilities | 34 | | :--- | :--- | :--- | 35 | | **OpenAI** | GPT-4.1, GPT-4 Turbo | Industry-leading performance in English contexts; optimized and scalable integration via Microsoft Azure [1, 25]. | 36 | | **Google** | Gemini Pro, Gemma 2 | Comprehensive AI platform (Vertex AI); strong multimodal support and open-source model releases [25, 35]. | 37 | | **Anthropic** | Claude-3 Series | Focus on safe and ethical AI development, utilizing Constitutional AI principles [12]. | 38 | | **Meta** | Llama 3.1 | Leading provider of high-performing, open-source foundation models for community adoption [13]. | 39 | | **Mistral AI** | Mistral Large 2 | Focus on highly efficient, performant, and open models, often optimized for European markets [3, 35]. | 40 | | **Baidu** | Ernie-Bot 4 | Recognized as the leading LLM provider specifically tailored for Chinese language contexts [14]. | 41 | 42 | ## Objective Benchmarks and Metrics 43 | 44 | Differentiation is measured across several objective benchmarks, moving beyond simple performance scores to include safety, efficiency, and enterprise readiness: 45 | 46 | 1. **Contextual Performance Leadership:** Performance varies significantly across languages and tasks. GPT 4-Turbo is consistently identified as being at the forefront in English contexts, while Baidu’s Ernie-Bot 4 leads the field in Chinese contexts [14]. 47 | 2. **Technical Differentiation Metrics:** Models are rigorously tracked and benchmarked across numerous technical specifications. Key metrics used for comparison include Safety Rank, Jailbreaking Resistance, and specific task performance benchmarks such as GPQA and Math LiveBench [15]. 48 | 3. **Enterprise and API Capabilities:** Providers differentiate their offerings based on features critical for enterprise adoption. These include structured output, tool use, vision/audio understanding, prompt caching, and fine-tuning capabilities, often delivered through integrated cloud platforms like Google's Vertex AI or Microsoft Azure [23, 25]. 49 | 50 | # Risks & Open Questions 51 | 52 | ## Knowledge Gaps and Unforeseen Threats 53 | 54 | * **Unknown Unknowns:** Risks that are unforeseen and unanticipated because they fall outside the realm of current knowledge or past experience, often arising from complex interdependencies [16]. These are particularly dangerous in cybersecurity, where assets are unknown and thus impossible to secure [17]. 55 | * **Internal Blind Spots:** The most significant risks often stem from knowledge gaps that organizations are unaware they possess. Many so-called unknown unknowns were actually knowable if key information held by project personnel had been communicated to top decision-makers [18]. 56 | * **Data Analysis Failure:** Most companies analyze only 12% of collected data, leaving 88% of competitive threats and opportunities unnoticed, which can lead to significant losses and competitive blind spots [19]. 57 | * **Competitive Displacement:** Established firms risk being displaced by smaller rivals due to internal routinism, outdated R&D opinions, and limited perception focused only on themselves, failing to conduct fact-based assessments of competitor threats [23, 24]. 
58 | 59 | ## Regulatory and Legal Friction 60 | 61 | * **Challenged Regulatory Authority:** The Supreme Court's decision to overrule the *Chevron* deference framework opens the door to new legal challenges against federal regulations, potentially having enduring effects on agencies' ability to implement policies [20]. 62 | * **Outdated Frameworks:** Developments in regenerative medicine creating 'hybrid therapies' (combining medicines, devices, and human cells) pose significant challenges to regulatory frameworks that traditionally regulated these components separately [21]. 63 | * **Global Hurdles:** Global regulatory hurdles for medical devices pose risks to innovation and market access [22]. Furthermore, regulatory challenges have been cited as the primary reason why certain payment innovation initiatives, such as Open Banking, have not fully met expectations [23]. 64 | * **Unregulated Services:** The traditional regulatory system for the legal profession is ineffective at fostering necessary collaboration with nonlaw professionals, especially as nonlawyers provide services online without regulation [24]. 65 | * **Regulatory Costs:** Optimal regulatory rule-making is complicated by the fact that firms can challenge regulations in court, forcing regulators to incur costs to preclude challenges or accept legal fees [25]. 66 | 67 | ## Technological Scaling and Systemic Failure 68 | 69 | * **AI Compute Bottlenecks:** Scaling computational resources for AI model training faces limitations over the next decade due to bottlenecks in the complex, internationally distributed supply chain for semiconductors [26]. Constraints in human capital and power are projected to limit cumulative AI chip manufacturing capacity, potentially reducing the likelihood of some near-term transformative AI scenarios [26]. 70 | * **Agentic Failure Modes:** Agentic AI systems are subject to various failure modes, including specific threats like a 'Memory poisoning attack' on an AI email assistant, which require analysis of their effects and potential mitigations [27]. 71 | * **Inevitable Failure:** Failure Mode Analysis (FMA) is based on the key tenet that failures are inevitable, even with multiple layers of resiliency, particularly in complex environments [28]. 72 | 73 | ## Ethical and Unintended Consequences 74 | 75 | * **Systemic Intervention Risks:** Interventions like Nature-based Solutions (NbS) carry systemic risks, including ecological side effects, socio-political inequities, and epistemological overconfidence, challenging the assumption that such interventions are inherently beneficial [29]. 76 | * **Sustainable Development Backlash:** Sustainable development initiatives often lead to unintended consequences within social-ecological systems, such as those related to conservation efforts or alternative livelihood programs [30]. 77 | * **Regulatory Side Effects:** Measures intended to combat money laundering and terrorist financing (AML/CFT) can result in unintended consequences, including derisking, financial exclusion, and the suppression of Non-Profit Organizations (NPOs) and human rights [31]. 78 | * **AI Ethical Challenges:** Emerging technologies, especially AI, are not ethically neutral [32]. Ethical risks associated with Artificial Intelligence in Education (AIED) are categorized across technology (e.g., algorithmic bias, privacy invasion), education (e.g., homogenized development), and society (e.g., digital divide, lack of accountability) [33]. 
79 | * **Dynamic Ethical Factors:** Business leaders must be aware of dynamic ethical risk factors, which are often driven by global geopolitical and economic events (e.g., inflation, cost of living crises) that pose reputational, operational, and financial risks [34]. 80 | 81 | # Outlook (12–24 months) 82 | 83 | The 12-to-24-month outlook suggests a period of decelerating US growth, stabilizing inflation in developed markets, continued geopolitical complexity, and accelerated technological competition in key sectors. 84 | 85 | ### 1. US Monetary Easing and Decelerated Growth (2025–2026) 86 | 87 | The US economy is forecasted to experience decelerated growth, with real GDP projected at 0.9% in 2025, maintaining a 30% to 50% probability of recession [35]. This environment will prompt the Federal Reserve to implement further rate cuts. The median forecast projects the federal funds target range to reach 3.926% by the end of 2025 and 3.625% by the end of 2026 [2, 21]. Core PCE inflation is expected to end 2025 at 3.1% year-over-year [35]. 88 | 89 | * **Leading Indicators to Track:** Real GDP growth figures, Core PCE inflation rate, FOMC dot plot updates, and the unemployment rate (expected to stabilize around 4.4% in 2026) [2, 20]. 90 | 91 | ### 2. European Economic Recovery and Inflation Stabilization (2025–2026) 92 | 93 | The Euro area economy is expected to strengthen, with real GDP growth accelerating to 1.2% in 2025 and 1.4% in 2026 [36]. This recovery is anticipated to pull the Austrian economy out of its post-WWII recession by mid-2025 [37]. Headline HICP inflation in the Euro area is expected to decline and stabilize near 1.9% throughout both 2025 and 2026 [36]. 94 | 95 | * **Leading Indicators to Track:** Euro area HICP inflation, Euro area real GDP growth, and unemployment rates (expected to average 6.5% in 2025) [36]. 96 | 97 | ### 3. Equity Market Highs and Dollar Weakness (2025) 98 | 99 | US equity markets are projected to continue their upward trajectory, with the S&P 500 expected to close near 6,000 by year-end 2025, supported by double-digit earnings growth [38]. Concurrently, the US Dollar is expected to weaken against Emerging Market (EM) currencies as US economic exceptionalism fades [38]. Ongoing trade policy shifts are anticipated to cause a broad-based downshift in global growth while shifting inflation pressures toward the U.S. [38]. 100 | 101 | * **Leading Indicators to Track:** S&P 500 earnings growth reports, DXY index performance, and EM currency performance [38]. 102 | 103 | ### 4. Intensified Geopolitical and Regulatory Complexity (2025–2026) 104 | 105 | Overall uncertainty, driven by macroeconomic and geopolitical disruptions, is expected to continue or deepen through 2026 [6, 3]. Geopolitical complexity will be heightened by the global elections supercycle and new areas of competition, particularly Artificial Intelligence (AI) and the oceans [39]. Businesses must navigate diverging AI policies and potential sweeping US tariff proposals, which risk raising CPI inflation by over 1 percentage point [15, 17]. Strategic realignment and supply chain adjustments will be necessary to manage compliance risks and reliance on critical mineral supply chains [15, 17]. 106 | 107 | * **Leading Indicators to Track:** Implementation status of new US tariff proposals, regulatory divergence indices concerning AI, and critical mineral supply chain stability metrics [15, 17]. 108 | 109 | ### 5. 
Accelerated Growth in Next-Generation Biotherapeutics (2025–2026) 110 | 111 | The Global Biotechnology Market, valued at $1.8 trillion in 2024, is expected to sustain strong growth (13.543% CAGR through 2034) [40]. The Next Generation Biotherapeutics segment, specifically, is projected to grow at a 12.05% CAGR (2026–2030), fueled by technological advancements in platforms like CRISPR gene editing, mRNA delivery, and synthetic biology [41]. Global regulatory bodies are expected to continue expediting market access for these innovative treatments, shortening time-to-market [41]. 112 | 113 | * **Leading Indicators to Track:** Number of new Breakthrough Therapy Designations granted by regulatory bodies, R&D investment in synthetic biology platforms, and revenue growth of key biotechnology firms [41]. 114 | 115 | ## References 116 | [1] McKinsey technology trends outlook 2024 — mckinsey.com (published 2024-07-16). https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-top-trends-in-tech?utm_source=pocket_shared 117 | [2] The next big arenas of competition — mckinsey.com (published 2024-10-01). https://www.mckinsey.com/~/media/mckinsey/mckinsey%20global%20institute/our%20research/the%20next%20big%20arenas%20of%20competition/the-next-big-arenas-of-competition_final.pdf 118 | [3] Investing in the infrastructure of modern society — mckinsey.com (published 2025-09-09). https://www.mckinsey.com/industries/infrastructure/our-insights/the-infrastructure-moment 119 | [4] Climate Change 2023 Synthesis Report — ipcc.ch. https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf 120 | [5] Technology Growth Statistics 2025: Market Size, AI, and Innovation — sqmagazine.co.uk (published 2025-05-14). https://sqmagazine.co.uk/technology-growth-statistics/ 121 | [6] The state of AI in 2023: Generative AI’s breakout year — mckinsey.com (published 2023-08-01). https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai-in-2023-generative-ais-breakout-year?utm_source=substack&utm_medium=email 122 | [7] Gartner Forecasts Worldwide IT Spending to Grow 8% in 2024 — gartner.com (published 2023-10-18). https://www.gartner.com/en/newsroom/press-releases/2023-10-18-gartner-forecasts-worldwide-it-spending-to-grow-8-percent-in-2024 123 | [8] AI Accelerators Market Research 2025-2029 — globenewswire.com (published 2025-08-15). https://www.globenewswire.com/news-release/2025/08/15/3134075/28124/en/AI-Accelerators-Market-Research-2025-2029-Market-to-Expand-at-44-1-CAGR-Driven-by-Sovereign-AI-and-Geopolitical-Factors.html 124 | [9] Gartner Forecasts Worldwide IT Spending to Grow 9.3% in 2025 — gartner.com (published 2024-10-23). https://www.gartner.com/en/newsroom/press-releases/2024-10-23-gartner-forecasts-worldwide-it-spending-to-grow-nine-point-three-percent-in-2025 125 | [10] Industry outlook 2025 — eiu.com (published 2024-10-24). https://www.eiu.com/n/campaigns/industry-outlook-2025/ 126 | [11] Navigating the generative AI vendor landscape — transformainsights.com (published 2024-01-30). https://transformainsights.com/research/reports/navigating-generative-ai-vendor-landscape 127 | [12] LLM Price Comparison | Compare AI Model Costs — llmpricecomparison.com (published 2025-01-01). https://llmpricecomparison.com/providers 128 | [13] The Vanguard of Open-Source LLMs: A Comprehensive Analysis (2024–2025) — medium.com (published 2025-08-04). 
https://medium.com/@haiderkhan6410/the-vanguard-of-open-source-llms-a-comprehensive-analysis-2024-2025-a5805592fe8f 129 | [14] Unveiling the Competitive Dynamics: A Comparative Evaluation of American and Chinese LLMs — papers.ssrn.com (published 2024-05-10). https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4823501 130 | [15] LLM Decision Hub - AI Model Rankings & Benchmarks — llmleaderboard.ai (published 2025-03-01). https://llmleaderboard.ai/leaderboard/ 131 | [16] Navigating the “Unknown Unknowns” Risks — linkedin.com (published 2024-10-23). https://www.linkedin.com/pulse/navigating-unknown-unknowns-risks-st-mmt-ipm-pmp-pmi-rmp-uustc 132 | [17] What are Unknown Unknowns | Cybersecurity Glossary — cycognito.com (published 2025-01-01). https://www.cycognito.com/glossary/unknown-unknowns.php 133 | [18] Looking in the right places to identify “unknown unknowns ... — i2insights.org (published 2019-11-05). https://i2insights.org/2019/11/05/detecting-unknown-unknowns-in-projects/ 134 | [19] Blind Spot: Uncovering Hidden Competitive Threats — octopusintelligence.com (published 2025-09-19). https://www.octopusintelligence.com/the-2-3-million-blind-spot-why-73-of-companies-miss-their-biggest-competitive-threats-until-its-too-late/ 135 | [20] Supreme Court Opens Door to New Legal Challenges ... — mcguirewoods.com (published 2024-07-03). https://www.mcguirewoods.com/client-resources/alerts/2024/7/supreme-court-opens-door-to-new-legal-challenges-to-federal-regulations-new-and-old/ 136 | [21] Legal and Regulatory Challenges for Emerging Regenerative... : Transplantation — journals.lww.com (published 2025-05-13). https://journals.lww.com/transplantjournal/fulltext/2024/05000/legal_and_regulatory_challenges_for_emerging.15.aspx 137 | [22] Global Regulatory Challenges for Medical Devices: Impact on Innovation and Market Access — mdpi.com (published 2024-10-12). https://www.mdpi.com/2076-3417/14/20/9304 138 | [23] UK Payment Innovations: Regulatory Hurdles in Open Banking — edgardunn.com (published 2024-11-01). https://www.edgardunn.com/articles/how-much-regulation-is-needed 139 | [24] Regulatory Hurdles--21st Century Law — michbar.org (published 2021-01-01). https://www.michbar.org/future/regulation 140 | [25] Regulatory Rule-Making with Legal Challenges — sciencedirect.com (published 2000-08-10). https://www.sciencedirect.com/science/article/pii/S0095069699911087 141 | [26] Scaling Limits to AI Chip Manufacturing — openreview.net (published 2025-05-13). https://openreview.net/pdf?id=jwhrNLkEuk 142 | [27] Taxonomy of Failure Mode in Agentic AI Systems — cdn-dynmedia-1.microsoft.com (published 2025-04-16). https://cdn-dynmedia-1.microsoft.com/is/content/microsoftcorp/microsoft/final/en-us/microsoft-brand/documents/Taxonomy-of-Failure-Mode-in-Agentic-AI-Systems-Whitepaper.pdf 143 | [28] Architecture strategies for performing failure mode analysis — learn.microsoft.com (published 2024-10-08). https://learn.microsoft.com/en-us/azure/well-architected/reliability/failure-mode-analysis 144 | [29] Rethinking Nature-Based Solutions: Unintended Consequences, Ancient Wisdom, and the Limits of Nature — mdpi.com (published 2025-06-13). https://www.mdpi.com/2073-445X/14/6/1272 145 | [30] Unintended consequences of sustainable development ... — ecologyandsociety.org (published 2022-05-26). https://ecologyandsociety.org/vol27/iss2/art10/ 146 | [31] Mitigating Unintended Consequences — fatf-gafi.org (published 2024-07-31). 
https://www.fatf-gafi.org/en/publications/Financialinclusionandnpoissues/Unintended-consequences-project.html 147 | [32] Ethical Considerations and Challenges of AI Adoption in Project Management — link.springer.com (published 2025-03-18). https://link.springer.com/chapter/10.1007/978-3-031-56310-2_16?error=cookies_not_supported&code=c39f2730-ee66-4e00-ac89-8e3a3ce785f6 148 | [33] a systematic review on identifying and mitigating ethical risks — nature.com (published 2025-07-16). https://www.nature.com/articles/s41599-025-05252-6 149 | [34] What are the ethical risk factors business leaders are most ... — ibe.org.uk (published 2024-02-19). https://www.ibe.org.uk/resource/what-are-the-ethical-risk-factors-business-leaders-are-most-concerned-about-in-2024.html 150 | [35] SIFMA 2025 Mid-Year Economic Survey — sifma.org (published 2025-07-11). https://www.sifma.org/wp-content/uploads/2024/12/1H25-Economic-Survey-Report_2025-07-11.pdf 151 | [36] The ECB Survey of Professional Forecasters - Fourth quarter of 2024 — ecb.europa.eu (published 2024-10-18). https://www.ecb.europa.eu/stats/ecb_surveys/survey_of_professional_forecasters/html/ecb.spf2024q4~ee6e2cd847.en.html 152 | [37] Austria is in its Third Year of Recession. Economic Outlook for 2025 and 2026 — wifo.ac.at (published 2025-05-16). https://www.wifo.ac.at/wp-content/uploads/upload-8802/roa_2025_05_economic_outlook.pdf 153 | [38] Mid-year market outlook 2025 | J.P. Morgan Research — jpmorgan.com (published 2025-07-01). https://www.jpmorgan.com/insights/global-research/outlook/mid-year-outlook 154 | [39] Top 10 geopolitical risks for 2024 — ey.com (published 2025-06-11). https://www.ey.com/en_gl/insights/geostrategy/2024-geostrategic-outlook 155 | [40] Biotechnology Market Outlook Report: Industry Size, Market Shares Data, Latest Trends, Insights, Growth Potential, CAGR Forecasts to 2034 — researchandmarkets.com (published 2025-01-01). https://www.researchandmarkets.com/reports/6027439/biotechnology-market-outlook-report-industry?srsltid=AfmBOoriaKomZjCF_-4pG3kRYUgNfiEPX3VKmlGYgtKGK_qZKNubnpLf 156 | [41] Next Generation Biotherapeutics Market to Grow with a CAGR of 12.05% through 2030 — techsciresearch.com (published 2025-01-01). 
https://techsciresearch.com/news/6646-next-generation-biotherapeutics-market.html
--------------------------------------------------------------------------------
/dspy-gepa-researcher/dspy_gepa_researcher.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import os
4 | import re
5 | import json
6 | import asyncio
7 | import operator
8 | from typing import Any, Dict, List, Optional, Tuple, Annotated
9 | from urllib.parse import urlparse
10 | from pydantic import BaseModel, Field
11 | from dateutil import parser as dtparse
12 | 
13 | from typing_extensions import TypedDict
14 | from langgraph.graph import StateGraph, START, END
15 | from langgraph.types import Send
16 | 
17 | import dspy
18 | from dspy.teleprompt import GEPA
19 | 
20 | from exa_py import Exa
21 | 
22 | # ----------------------------
23 | # Configuration
24 | # ----------------------------
25 | 
26 | MAX_ROUNDS = int(os.environ.get("RR_MAX_ROUNDS", "1"))  # writer<->research loop rounds
27 | SEARCH_RESULTS_PER_QUERY = int(os.environ.get("RR_SEARCH_K", "6"))  # per query
28 | MAX_CONTENT_CHARS_PER_SOURCE = int(os.environ.get("RR_MAX_CHARS", "12000"))
29 | 
30 | WRITER_MODEL = os.environ.get("GEMINI_WRITER_MODEL", "gemini/gemini-flash-latest")
31 | RESEARCH_MODEL = os.environ.get("GEMINI_RESEARCH_MODEL", "gemini/gemini-flash-latest")
32 | REFLECTION_MODEL = os.environ.get("GEMINI_REFLECTION_MODEL", WRITER_MODEL)
33 | 
34 | FALLBACK_WRITER = "gemini/gemini-flash-latest"
35 | FALLBACK_RESEARCH = "gemini/gemini-flash-latest"
36 | 
37 | GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
38 | EXA_API_KEY = os.environ.get("EXA_API_KEY")
39 | 
40 | if not GEMINI_API_KEY:
41 |     raise RuntimeError("GEMINI_API_KEY is not set.")
42 | if not EXA_API_KEY:
43 |     raise RuntimeError("EXA_API_KEY is not set. 
Get one from dashboard.exa.ai") 44 | 45 | # Initialize Exa once (thread-safe to call via to_thread) 46 | EXA = Exa(EXA_API_KEY) 47 | 48 | # ---------------------------- 49 | # Utilities & data models 50 | # ---------------------------- 51 | 52 | def short_host(u: str) -> str: 53 | try: 54 | return urlparse(u).netloc.replace("www.", "") 55 | except Exception: 56 | return u 57 | 58 | def clamp(n: float, lo=0.0, hi=1.0) -> float: 59 | return max(lo, min(hi, n)) 60 | 61 | def safe_json_loads(s: str, fallback=None): 62 | try: 63 | return json.loads(s) 64 | except Exception: 65 | return fallback 66 | 67 | class SectionSpec(BaseModel): 68 | name: str 69 | instructions: str 70 | 71 | class SourceDoc(BaseModel): 72 | url: str 73 | title: Optional[str] = None 74 | site: Optional[str] = None 75 | published: Optional[str] = None 76 | content: Optional[str] = None 77 | 78 | class ResearchSummary(BaseModel): 79 | section: str 80 | query: str 81 | bullets: List[str] = Field(default_factory=list) 82 | sources: List[SourceDoc] = Field(default_factory=list) 83 | 84 | class ReviewReport(BaseModel): 85 | pass_checks: bool 86 | summary: str 87 | issues: List[str] = Field(default_factory=list) 88 | suggestions: List[str] = Field(default_factory=list) 89 | 90 | class EvalResult(BaseModel): 91 | score: float 92 | breakdown: Dict[str, float] 93 | notes: str 94 | 95 | # ---------------------------- 96 | # DSPy model setup 97 | # ---------------------------- 98 | 99 | def _make_lm(model_name: str, api_key: str, temperature: float = 0.3, model_type: str = "chat", max_tokens: int = 65536): 100 | """Create a DSPy LM via LiteLLM provider strings (e.g., 'gemini/gemini-2.5-pro-preview-03-25').""" 101 | try: 102 | return dspy.LM(model_name, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 103 | except Exception: 104 | if "pro" in model_name: 105 | return dspy.LM(FALLBACK_WRITER, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 106 | return dspy.LM(FALLBACK_RESEARCH, api_key=api_key, temperature=temperature, model_type=model_type, max_tokens=max_tokens) 107 | 108 | WRITER_LM = _make_lm(WRITER_MODEL, GEMINI_API_KEY, temperature=0.2) 109 | RESEARCH_LM = _make_lm(RESEARCH_MODEL, GEMINI_API_KEY, temperature=0.4) 110 | REFLECT_LM = _make_lm(REFLECTION_MODEL, GEMINI_API_KEY, temperature=0.8) 111 | 112 | dspy.configure(lm=RESEARCH_LM, cache=True) 113 | 114 | # ---------------------------- 115 | # DSPy Signatures (instructions) 116 | # ---------------------------- 117 | 118 | class QueryGenSig(dspy.Signature): 119 | """Produce 4–8 diverse Exa search queries for a section (use quoted phrases, site:, intitle:, date ranges). Return a JSON list of strings.""" 120 | section_title = dspy.InputField() 121 | section_instructions = dspy.InputField() 122 | queries_json = dspy.OutputField() 123 | 124 | class SummarizeSig(dspy.Signature): 125 | """Summarize source texts into evidence bullets for the section. 126 | OUTPUT JSON: {"bullets": ["...", "..."]}. Cite as [S#] (matching the per-query ordering). Keep bullets concise & factual. 127 | """ 128 | prompt = dspy.InputField() 129 | sources_digest = dspy.InputField() 130 | output_json = dspy.OutputField() 131 | 132 | class WriteSectionSig(dspy.Signature): 133 | """Write a polished Markdown section '# {section_title}' using [n] numeric citations only. Avoid bare URLs. 
Return ONLY the section Markdown.""" 134 | section_title = dspy.InputField() 135 | section_instructions = dspy.InputField() 136 | evidence_digest = dspy.InputField() 137 | output_markdown = dspy.OutputField() 138 | 139 | class GapAnalysisSig(dspy.Signature): 140 | """Given current bullets, decide if more research is needed. OUTPUT JSON: {"need_more": bool, "followup_queries": ["..."]}""" 141 | section_title = dspy.InputField() 142 | bullets_digest = dspy.InputField() 143 | output_json = dspy.OutputField() 144 | 145 | class CiteFixSig(dspy.Signature): 146 | """Fix citations: ensure only [n] numeric citations (no [S#] or raw URLs). Return ONLY the corrected Markdown body.""" 147 | markdown_body = dspy.InputField() 148 | id_map_notes = dspy.InputField() 149 | fixed_markdown = dspy.OutputField() 150 | 151 | class ReviewSig(dspy.Signature): 152 | """Review the full report for coverage, correctness, clarity, neutrality, structure, citation hygiene. OUTPUT JSON: {pass_checks, issues, suggestions, summary}""" 153 | report_md = dspy.InputField() 154 | output_json = dspy.OutputField() 155 | 156 | class ReviseSig(dspy.Signature): 157 | """Apply review suggestions to the report without adding new unsupported facts. Return the improved Markdown body (no References).""" 158 | report_md = dspy.InputField() 159 | suggestions = dspy.InputField() 160 | improved_md = dspy.OutputField() 161 | 162 | QUERY_GEN = dspy.ChainOfThought(QueryGenSig) 163 | SUMMARIZER = dspy.Predict(SummarizeSig) 164 | WRITE_SECTION = dspy.ChainOfThought(WriteSectionSig) 165 | GAP_ANALYZER = dspy.Predict(GapAnalysisSig) 166 | CITE_FIXER = dspy.Predict(CiteFixSig) 167 | REVIEWER = dspy.Predict(ReviewSig) 168 | REVISER = dspy.ChainOfThought(ReviseSig) 169 | 170 | # ---------------------------- 171 | # GEPA: fast optimization 172 | # ---------------------------- 173 | 174 | def heuristic_report_metric(gold, pred, trace=None, pred_name=None, pred_trace=None) -> float: 175 | """LLM-free shaping signal for GEPA.""" 176 | text = "" 177 | if hasattr(pred, "output_markdown"): text = pred.output_markdown or "" 178 | elif hasattr(pred, "fixed_markdown"): text = pred.fixed_markdown or "" 179 | elif hasattr(pred, "queries_json"): text = pred.queries_json or "" 180 | elif hasattr(pred, "output_json"): text = pred.output_json or "" 181 | score, notes = 0.0, [] 182 | if hasattr(pred, "queries_json"): 183 | data = safe_json_loads(text, []) 184 | uniq = len(set([q.strip().lower() for q in data if isinstance(q, str)])) 185 | has_ops = any(("site:" in (q or "").lower() or "intitle:" in (q or "").lower() or '"' in (q or "")) for q in data if isinstance(q, str)) 186 | score = 0.3*clamp(uniq/8) + 0.2*(1 if 4 <= uniq <= 10 else 0) + 0.5*(1 if has_ops else 0) 187 | if uniq < 4: notes.append("Add 6–8 diverse queries.") 188 | if not has_ops: notes.append("Use operators like site:, intitle:, \"quoted\".") 189 | elif hasattr(pred, "output_markdown"): 190 | has_h1 = bool(re.search(r"^#\s+", text, flags=re.M)) 191 | cites = len(re.findall(r"\[\d+\]", text)) 192 | urls_inline = bool(re.search(r"https?://", text)) 193 | too_short = len(text) < 700 194 | score = (0.25*(1 if has_h1 else 0) + 0.35*clamp(cites/6) + 0.15*(0 if urls_inline else 1) + 0.25*(0 if too_short else 1)) 195 | if not has_h1: notes.append("Start with H1.") 196 | if cites < 3: notes.append("Add more [n] citations.") 197 | if urls_inline: notes.append("No bare URLs in body.") 198 | if too_short: notes.append("Increase depth (>=700 chars).") 199 | elif hasattr(pred, "fixed_markdown"): 200 | leftovers = 
bool(re.search(r"\[S\d+\]", text)) 201 | bracket_nums = bool(re.search(r"\[\d+\]", text)) 202 | score = 0.6*(1 if bracket_nums else 0) + 0.4*(0 if leftovers else 1) 203 | if leftovers: notes.append("Replace [S#] with [n].") 204 | elif hasattr(pred, "output_json"): 205 | ok = safe_json_loads(text) is not None 206 | score = 1.0 if ok else 0.2 207 | if not ok: notes.append("Return valid JSON.") 208 | else: 209 | score = 0.5; notes.append("Improve structure.") 210 | return float(clamp(score, 0, 1)) 211 | 212 | def optimize_with_gepa() -> None: 213 | """Optimize each DSPy module with its specific training set.""" 214 | # Map each module to its training set function 215 | module_trainsets = { 216 | QUERY_GEN: trainset_query_gen(), 217 | SUMMARIZER: trainset_summarizer(), 218 | WRITE_SECTION: trainset_write_section(), 219 | GAP_ANALYZER: trainset_gap_analyzer(), 220 | CITE_FIXER: trainset_cite_fixer(), 221 | REVIEWER: trainset_reviewer(), 222 | REVISER: trainset_reviser(), 223 | } 224 | 225 | tele = GEPA(metric=heuristic_report_metric, auto="light", reflection_lm=REFLECT_LM, track_stats=False) 226 | 227 | for module, trainset in module_trainsets.items(): 228 | module_name = type(module).__name__ 229 | print(f"[GEPA] Optimizing {module_name} with {len(trainset)} examples...") 230 | try: 231 | tele.compile(student=module, trainset=trainset) 232 | print(f"[GEPA] ✓ {module_name} optimization complete") 233 | except Exception as e: 234 | print(f"[GEPA] ✗ Skipped {module_name}: {e}") 235 | 236 | # Module-specific training sets for GEPA 237 | 238 | def trainset_query_gen() -> List[dspy.Example]: 239 | """Training examples for QUERY_GEN: diverse search queries with operators.""" 240 | return [ 241 | dspy.Example( 242 | section_title="Market Size", 243 | section_instructions="TAM, SAM, SOM 2019–2025 with figures and growth rates", 244 | queries_json='["market size TAM 2024", "site:.gov industry market size", "intitle:forecast 2024..2025", "\\"total addressable market\\" 2024", "SAM SOM market sizing report", "site:statista.com market size trends"]' 245 | ).with_inputs('section_title', 'section_instructions'), 246 | dspy.Example( 247 | section_title="Technology Stack", 248 | section_instructions="Current tools, frameworks, and infrastructure with version numbers", 249 | queries_json='["site:github.com popular frameworks 2024", "intitle:\\"tech stack\\" comparison", "\\"technology adoption\\" trends 2024", "infrastructure tools benchmarks", "site:stackoverflow.com framework usage statistics", "developer survey 2024 tools"]' 250 | ).with_inputs('section_title', 'section_instructions'), 251 | dspy.Example( 252 | section_title="Regulatory Landscape", 253 | section_instructions="Key regulations, compliance requirements, and policy changes 2023-2025", 254 | queries_json='["site:.gov regulations 2024", "compliance requirements industry", "intitle:\\"policy changes\\" 2024..2025", "\\"regulatory framework\\" updates", "site:.org legal compliance guidelines", "data protection regulations 2024"]' 255 | ).with_inputs('section_title', 'section_instructions'), 256 | ] 257 | 258 | def trainset_summarizer() -> List[dspy.Example]: 259 | """Training examples for SUMMARIZER: source texts to cited bullets.""" 260 | return [ 261 | dspy.Example( 262 | prompt="Summarize for section 'Market Growth'. 
Cite using [S#] matching the source indices above.", 263 | sources_digest="S1 | Market Report 2024 — example.com\nThe global market grew 15% YoY in Q3 2024, reaching $500B valuation.\n\nS2 | Industry Analysis — research.org\nAdoption rates increased from 23% in 2023 to 34% in 2024, driven by enterprise demand.", 264 | output_json='{"bullets": ["Global market grew 15% YoY in Q3 2024, reaching $500B valuation [S1]", "Adoption rates increased from 23% (2023) to 34% (2024), driven by enterprise demand [S2]"]}' 265 | ).with_inputs('prompt', 'sources_digest'), 266 | dspy.Example( 267 | prompt="Summarize for section 'Key Risks'. Cite using [S#] matching the source indices above.", 268 | sources_digest="S1 | Risk Assessment Report — analyst.com\nSupply chain disruptions pose the highest risk, with 67% of companies reporting delays.\n\nS2 | Security Analysis — security.org\nCybersecurity incidents increased 45% in H1 2024 compared to 2023.", 269 | output_json='{"bullets": ["Supply chain disruptions pose highest risk, with 67% of companies reporting delays [S1]", "Cybersecurity incidents increased 45% in H1 2024 vs 2023 [S2]"]}' 270 | ).with_inputs('prompt', 'sources_digest'), 271 | ] 272 | 273 | def trainset_write_section() -> List[dspy.Example]: 274 | """Training examples for WRITE_SECTION: evidence to markdown with citations.""" 275 | return [ 276 | dspy.Example( 277 | section_title="Market Trends", 278 | section_instructions="2024-2025 trends with specific data points and citations", 279 | evidence_digest="Evidence for 'Market Trends':\n- Cloud adoption growing 25% annually [1]\n- AI integration became standard in 78% of new products [2]\n- Mobile-first approach now 89% of development strategies [3]", 280 | output_markdown='# Market Trends\n\nThe market has experienced significant transformation in 2024-2025. Cloud adoption continues to accelerate at 25% annual growth [1], fundamentally changing infrastructure strategies across enterprises. Artificial intelligence has moved from experimental to essential, with 78% of new products now integrating AI capabilities as a standard feature [2].\n\nDevelopment approaches have also shifted dramatically toward mobile-first strategies, with 89% of teams now prioritizing mobile experiences in their product roadmaps [3]. This represents a maturation of mobile technology from an afterthought to the primary delivery platform for most digital services.' 281 | ).with_inputs('section_title', 'section_instructions', 'evidence_digest'), 282 | dspy.Example( 283 | section_title="Competitive Landscape", 284 | section_instructions="Top 5 players with market share and differentiators", 285 | evidence_digest="Evidence for 'Competitive Landscape':\n- Company A leads with 32% market share, strong in enterprise [1]\n- Company B holds 24%, focused on SMB segment [2]\n- Company C at 18%, competing on price and features [3]\n- Companies D and E combined hold 15%, niche players [4]", 286 | output_markdown='# Competitive Landscape\n\nThe market is led by Company A with a commanding 32% market share, leveraging strong enterprise relationships and established trust [1]. Company B follows with 24% market share, having carved out a successful position in the SMB segment through targeted product features and pricing [2].\n\nCompany C holds 18% of the market, competing aggressively on both price and feature completeness [3]. The remaining market is fragmented among smaller players, with Companies D and E combining for 15% share, each serving specific niche segments [4].' 
287 | ).with_inputs('section_title', 'section_instructions', 'evidence_digest'), 288 | ] 289 | 290 | def trainset_gap_analyzer() -> List[dspy.Example]: 291 | """Training examples for GAP_ANALYZER: decide if more research needed.""" 292 | return [ 293 | dspy.Example( 294 | section_title="Market Size", 295 | bullets_digest="- Market size $500B in 2024\n- Growing 15% YoY\n- North America 45% share\n- Asia Pacific 30% share\n- Europe 20% share\n- Forecast to reach $850B by 2027", 296 | output_json='{"need_more": false, "followup_queries": []}' 297 | ).with_inputs('section_title', 'bullets_digest'), 298 | dspy.Example( 299 | section_title="Technology Stack", 300 | bullets_digest="- React most popular framework\n- Python growing in backend", 301 | output_json='{"need_more": true, "followup_queries": ["infrastructure technologies 2024", "database trends and adoption", "cloud platform market share", "DevOps tool usage statistics"]}' 302 | ).with_inputs('section_title', 'bullets_digest'), 303 | dspy.Example( 304 | section_title="Risks", 305 | bullets_digest="No bullets yet.", 306 | output_json='{"need_more": true, "followup_queries": ["industry risks 2024", "compliance challenges", "supply chain vulnerabilities", "cybersecurity threats report", "market disruption factors"]}' 307 | ).with_inputs('section_title', 'bullets_digest'), 308 | ] 309 | 310 | def trainset_cite_fixer() -> List[dspy.Example]: 311 | """Training examples for CITE_FIXER: convert [S#] to [n].""" 312 | return [ 313 | dspy.Example( 314 | markdown_body="# Findings\n\nResearch shows significant growth [S1] with adoption increasing [S2]. The trend continues [S1] into 2025.", 315 | id_map_notes="1 -> https://example.com/report\n2 -> https://research.org/study", 316 | fixed_markdown="# Findings\n\nResearch shows significant growth [1] with adoption increasing [2]. The trend continues [1] into 2025." 317 | ).with_inputs('markdown_body', 'id_map_notes'), 318 | dspy.Example( 319 | markdown_body="# Market Analysis\n\nThe market leader [S3] holds 40% share, while challenger [S1] has 25%. Recent data [S2] confirms this distribution [S3].", 320 | id_map_notes="1 -> https://news.com/article\n2 -> https://data.org/stats\n3 -> https://market.com/report", 321 | fixed_markdown="# Market Analysis\n\nThe market leader [3] holds 40% share, while challenger [1] has 25%. Recent data [2] confirms this distribution [3]." 322 | ).with_inputs('markdown_body', 'id_map_notes'), 323 | ] 324 | 325 | def trainset_reviewer() -> List[dspy.Example]: 326 | """Training examples for REVIEWER: review reports for quality.""" 327 | return [ 328 | dspy.Example( 329 | report_md="# Executive Summary\n\nThis report analyzes market trends based on comprehensive research [1][2][3].\n\n# Market Size\n\nThe global market reached $500B in 2024, growing 15% YoY [1]. Regional distribution shows North America at 45%, Asia Pacific 30%, and Europe 20% [2].\n\n## References\n[1] Market Report 2024 — example.com. https://example.com/report\n[2] Regional Analysis — research.org. https://research.org/study\n[3] Industry Forecast — analyst.com. https://analyst.com/forecast", 330 | output_json='{"pass_checks": true, "issues": [], "suggestions": ["Consider adding specific date ranges for projections", "Could expand on Asia Pacific growth drivers"], "summary": "Well-structured report with good citation coverage and clear regional breakdown."}' 331 | ).with_inputs('report_md'), 332 | dspy.Example( 333 | report_md="# Market Overview\n\nThe market is growing fast. 
Many companies are entering the space. https://example.com shows good data.\n\n## References", 334 | output_json='{"pass_checks": false, "issues": ["No specific data or citations in body", "Bare URL in prose instead of numeric citation", "Empty references section", "Vague language without concrete facts"], "suggestions": ["Add specific growth percentages with citations", "Replace bare URL with [n] citation and add to references", "Include concrete company names and market shares", "Provide time-specific data points"], "summary": "Report lacks specificity, proper citations, and concrete data. Needs major revision."}' 335 | ).with_inputs('report_md'), 336 | ] 337 | 338 | def trainset_reviser() -> List[dspy.Example]: 339 | """Training examples for REVISER: improve reports based on feedback.""" 340 | return [ 341 | dspy.Example( 342 | report_md="# Market Size\n\nThe market is big and growing. Many companies participate.\n\n## References\n[1] Market Report — example.com. https://example.com", 343 | suggestions="- Add specific market size figures with citations\n- Include growth percentages and timeframes\n- Specify key player names", 344 | improved_md="# Market Size\n\nThe global market reached $500 billion in 2024, representing 15% year-over-year growth [1]. The market is characterized by strong participation from both established enterprises and emerging startups, with the top 5 players accounting for 68% of total market share [1]." 345 | ).with_inputs('report_md', 'suggestions'), 346 | dspy.Example( 347 | report_md="# Technology Trends\n\nAI is popular. Cloud computing is used a lot. https://tech.com has more info.\n\n## References\n[1] Tech Report — tech.com. https://tech.com", 348 | suggestions="- Replace bare URL with proper citation\n- Add specific adoption percentages\n- Include concrete technology names", 349 | improved_md="# Technology Trends\n\nArtificial intelligence adoption has reached 78% among enterprise software products in 2024 [1]. Cloud infrastructure utilization continues to grow, with 89% of organizations now operating hybrid or multi-cloud environments [1]. Key technologies driving this transformation include transformer-based language models, containerization platforms like Kubernetes, and serverless computing architectures [1]." 350 | ).with_inputs('report_md', 'suggestions'), 351 | ] 352 | 353 | # ---------------------------- 354 | # Exa search + contents 355 | # ---------------------------- 356 | 357 | async def exa_search_and_contents(query: str, k: int) -> List[SourceDoc]: 358 | """ 359 | Use Exa's search_and_contents to get top-k results with full text. 360 | Python SDK fields: result.url, result.title, result.published_date, result.text. (See SDK spec.) 
361 | """ 362 | def _call(): 363 | # You can add: category="news"/"research paper", start_published_date=..., type="auto" 364 | return EXA.search_and_contents(query, text=True, num_results=k) 365 | try: 366 | resp = await asyncio.to_thread(_call) 367 | except Exception as e: 368 | print(f"[Exa] search failed for '{query}': {e}") 369 | return [] 370 | docs: List[SourceDoc] = [] 371 | for r in getattr(resp, "results", []) or []: 372 | text = (getattr(r, "text", None) or "")[:MAX_CONTENT_CHARS_PER_SOURCE] 373 | if not text or len(text) < 200: 374 | continue 375 | url = getattr(r, "url", None) or getattr(r, "id", None) 376 | title = getattr(r, "title", None) 377 | pub = getattr(r, "published_date", None) 378 | # Normalize published to ISO date if possible 379 | if pub: 380 | try: pub = dtparse.parse(pub).date().isoformat() 381 | except Exception: pass 382 | docs.append(SourceDoc(url=url, title=title, site=short_host(url), published=pub, content=text)) 383 | return docs 384 | 385 | # ---------------------------- 386 | # Citation registry 387 | # ---------------------------- 388 | 389 | class CitationRegistry: 390 | def __init__(self): self.url_to_id: Dict[str, int] = {}; self.ordered: List[str] = [] 391 | def assign(self, url: str) -> int: 392 | if url not in self.url_to_id: 393 | self.url_to_id[url] = len(self.ordered) + 1 394 | self.ordered.append(url) 395 | return self.url_to_id[url] 396 | def references_markdown(self, url_to_doc: Dict[str, SourceDoc]) -> str: 397 | lines = ["## References"] 398 | for u in self.ordered: 399 | idx = self.url_to_id[u]; doc = url_to_doc.get(u) or SourceDoc(url=u) 400 | label = doc.title or u; site = f" — {doc.site}" if doc.site else "" 401 | dt = f" (published {doc.published})" if doc.published else "" 402 | lines.append(f"[{idx}] {label}{site}{dt}. 
{u}") 403 | return "\n".join(lines) 404 | 405 | # ---------------------------- 406 | # Graph state 407 | # ---------------------------- 408 | 409 | class GraphState(TypedDict): 410 | topic: str 411 | sections: List[SectionSpec] 412 | round: int 413 | queries: Annotated[List[Dict[str, str]], operator.add] # [{"section","query"}] 414 | research: Annotated[List[ResearchSummary], operator.add] # append 415 | drafts: Annotated[Dict[str, str], operator.or_] # {section: markdown} 416 | cite_maps: Annotated[Dict[str, Dict[int, str]], operator.or_] # {section: {local_num: url}} 417 | used_urls: Annotated[List[str], operator.add] # optional 418 | report_md: Optional[str] 419 | references_md: Optional[str] 420 | eval_result: Optional[EvalResult] 421 | 422 | # ---------------------------- 423 | # Nodes (agents) 424 | # ---------------------------- 425 | 426 | def plan_queries(state: GraphState) -> GraphState: 427 | print("\n" + "="*80) 428 | print(f"[QUERY] Planning search queries for {len(state['sections'])} sections...") 429 | print("="*80) 430 | 431 | new_queries: List[Dict[str, str]] = [] 432 | with dspy.context(lm=WRITER_LM): 433 | for sec in state["sections"]: 434 | q = QUERY_GEN(section_title=sec.name, section_instructions=sec.instructions) 435 | data = safe_json_loads(q.queries_json, []) 436 | uniq, seen = [], set() 437 | for s in data: 438 | if isinstance(s, str): 439 | s2 = s.strip() 440 | if s2 and s2.lower() not in seen: 441 | uniq.append(s2); seen.add(s2.lower()) 442 | if len(uniq) >= 8: break 443 | if not uniq: 444 | uniq = [f'{sec.name} overview', f'{sec.name} trends 2024..2025', f'"{sec.name}" case studies', f'intitle:{sec.name} report PDF', f'site:.gov {sec.name}', f'site:.org {sec.name}'] 445 | print(f"[QUERY] {sec.name}: {len(uniq)} queries generated") 446 | for u in uniq[:8]: 447 | new_queries.append({"section": sec.name, "query": u}) 448 | 449 | print(f"[QUERY] Total queries planned: {len(new_queries)}\n") 450 | return {"queries": new_queries} 451 | 452 | def route_queries(state: GraphState): 453 | return [Send("search_node", {"section": item["section"], "query": item["query"]}) 454 | for item in state.get("queries", [])] 455 | 456 | async def search_node(state: GraphState) -> GraphState: 457 | """Exa search + contents + summarize (Flash).""" 458 | section, query = state["section"], state["query"] 459 | print(f"[SEARCH] '{section}': {query[:60]}{'...' if len(query) > 60 else ''}") 460 | 461 | docs = await exa_search_and_contents(query, k=SEARCH_RESULTS_PER_QUERY) 462 | print(f"[SEARCH] → Found {len(docs)} documents") 463 | 464 | # Build digest for LLM summarization: S1,S2,... per query 465 | pieces = [] 466 | for i, d in enumerate(docs, start=1): 467 | excerpt = (d.content or "")[:2000] 468 | pieces.append(f"S{i} | {d.title or d.url} — {d.site or ''}\n{excerpt}") 469 | sources_digest = "\n\n".join(pieces) if pieces else "NO_SOURCES" 470 | prompt = f"Summarize for section '{section}'. Cite using [S#] matching the source indices above." 
471 | 
472 |     with dspy.context(lm=RESEARCH_LM):
473 |         out = SUMMARIZER(prompt=prompt, sources_digest=sources_digest)
474 |     js = safe_json_loads(out.output_json, {}) or {}
475 |     bullets = js.get("bullets", [])
476 | 
477 |     print(f"[SEARCH] → Extracted {len(bullets)} evidence bullets\n")
478 |     return {"research": [ResearchSummary(section=section, query=query, bullets=bullets, sources=docs)]}
479 | 
480 | def merge_and_gap_analyze(state: GraphState) -> GraphState:
481 |     print("\n" + "="*80)
482 |     print(f"[GAP] Analyzing research coverage (Round {state['round'] + 1}/{MAX_ROUNDS})...")
483 |     print("="*80)
484 | 
485 |     sec_to_bullets: Dict[str, List[str]] = {}
486 |     for rs in state.get("research", []):
487 |         sec_to_bullets.setdefault(rs.section, []).extend(rs.bullets or [])
488 | 
489 |     followups: List[Dict[str, str]] = []
490 |     with dspy.context(lm=WRITER_LM):
491 |         for sec in state["sections"]:
492 |             bullets = sec_to_bullets.get(sec.name, [])
493 |             digest = "\n".join(f"- {b}" for b in bullets[:50]) if bullets else "No bullets yet."
494 |             print(f"[GAP] '{sec.name}': {len(bullets)} bullets collected")
495 | 
496 |             ga = GAP_ANALYZER(section_title=sec.name, bullets_digest=digest)
497 |             j = safe_json_loads(ga.output_json, {}) or {}
498 |             if j.get("need_more") and isinstance(j.get("followup_queries"), list):
499 |                 new_queries = [q for q in j["followup_queries"][:5] if isinstance(q, str) and q.strip()]
500 |                 if new_queries:
501 |                     print(f"[GAP] → Needs {len(new_queries)} more queries")
502 |                     for q in new_queries:
503 |                         followups.append({"section": sec.name, "query": q.strip()})
504 | 
505 |     if followups and state["round"] + 1 < MAX_ROUNDS:
506 |         print(f"\n[GAP] DECISION: Continue research with {len(followups)} followup queries\n")
507 |         return {"queries": followups, "round": state["round"] + 1}
508 | 
509 |     print("\n[GAP] DECISION: Research complete, proceeding to writing\n")
510 |     return {"queries": []}  # nothing new scheduled; route_or_write proceeds to writing
511 | 
512 | def route_or_write(state: GraphState):
513 |     # "queries" accumulates across rounds (operator.add), so only dispatch pairs that have not been searched yet.
514 |     done = {(rs.section, rs.query) for rs in state.get("research", [])}
515 |     pending = [q for q in state.get("queries", []) if (q["section"], q["query"]) not in done]
516 |     if pending and state["round"] > 0:
517 |         return [Send("search_node", {"section": q["section"], "query": q["query"]}) for q in pending]  # more research
518 |     # else: write each section in parallel
519 |     return [Send("write_section_node", {"section": s.name, "sections": state["sections"], "research": state.get("research", [])})
520 |             for s in state["sections"]]
521 | 
522 | def _build_evidence_digest(section: str, research: List[ResearchSummary]) -> Tuple[str, Dict[int, str]]:
523 |     """
524 |     Merge bullets across queries and build a section-local citation map (number -> url) for the writer pass.
525 |     Each query's sources are tagged S1..Sk; bullets citing [S#] are rewritten to stable section-local numbers so the writer and CITE_FIXER can resolve them. 
526 | """ 527 | lines = [f"Evidence for '{section}':"] 528 | s_to_url_global: Dict[int, str] = {} 529 | next_num = 1 530 | for rs in research: 531 | if rs.section != section: continue 532 | # map local S# for this query block to absolute local numbers for the section 533 | local_map = {} 534 | for d in rs.sources: 535 | local_map[f"S{len(local_map)+1}"] = d.url 536 | for b in rs.bullets: 537 | bb = b 538 | for s_id, url in local_map.items(): 539 | # assign a stable number for this section for each url 540 | if url not in s_to_url_global.values(): 541 | s_to_url_global[next_num] = url; assigned = next_num; next_num += 1 542 | else: 543 | # find existing number for this url 544 | assigned = [k for k,v in s_to_url_global.items() if v == url][0] 545 | bb = re.sub(rf"\[{s_id}\]", f"[{assigned}]", bb) 546 | lines.append(f"- {bb}") 547 | return "\n".join(lines), s_to_url_global # evidence digest, local map num->url 548 | 549 | def write_section_node(state: GraphState) -> GraphState: 550 | section = state["section"] 551 | sec_spec = next((s for s in state["sections"] if s.name == section), None) 552 | if not sec_spec: return {} 553 | 554 | print(f"[WRITE] Drafting section: '{section}'") 555 | 556 | edigest, local_num_to_url = _build_evidence_digest(section, state.get("research", [])) 557 | 558 | with dspy.context(lm=WRITER_LM): 559 | w = WRITE_SECTION(section_title=sec_spec.name, section_instructions=sec_spec.instructions, evidence_digest=edigest) 560 | md = w.output_markdown or f"# {section}\n\n*(No content was generated.)*\n" 561 | 562 | with dspy.context(lm=WRITER_LM): 563 | fixed = CITE_FIXER(markdown_body=md, id_map_notes="\n".join(f"{k} -> {v}" for k, v in local_num_to_url.items())) 564 | md2 = fixed.fixed_markdown or md 565 | 566 | used_ids = sorted(set(int(x) for x in re.findall(r"\[(\d+)\]", md2))) 567 | urls = [local_num_to_url.get(i) for i in used_ids if i in local_num_to_url] 568 | 569 | char_count = len(md2) 570 | cite_count = len(used_ids) 571 | print(f"[WRITE] → {char_count} chars, {cite_count} citations\n") 572 | 573 | # store section draft, local citation map (for final global renumber), and used urls 574 | return { 575 | "drafts": {section: md2}, 576 | "cite_maps": {section: local_num_to_url}, 577 | "used_urls": [u for u in urls if u] 578 | } 579 | 580 | def assemble_and_review(state: GraphState) -> GraphState: 581 | print("\n" + "="*80) 582 | print("[REVIEW] Assembling and reviewing final report...") 583 | print("="*80) 584 | 585 | order = [s.name for s in state["sections"]] 586 | global_reg = CitationRegistry() 587 | url_to_doc: Dict[str, SourceDoc] = {} 588 | 589 | # Build a metadata table for all discovered sources 590 | for rs in state.get("research", []): 591 | for d in rs.sources: 592 | url_to_doc[d.url] = d 593 | 594 | print(f"[REVIEW] Assembling {len(order)} sections...") 595 | 596 | # Renumber citations globally in order of their first appearance across sections 597 | def renumber_section(md: str, map_local: Dict[int, str]) -> str: 598 | def _repl(m): 599 | old_num = int(m.group(1)) 600 | url = map_local.get(old_num) 601 | if not url: return m.group(0) 602 | new_num = global_reg.assign(url) 603 | return f"[{new_num}]" 604 | # only replace inside the body (no references exist yet) 605 | return re.sub(r"\[(\d+)\]", _repl, md) 606 | 607 | parts = [] 608 | for sec in order: 609 | body = state["drafts"].get(sec, "") 610 | local_map = state.get("cite_maps", {}).get(sec, {}) 611 | parts.append(renumber_section(body, local_map)) 612 | body_renumbered = 
"\n\n".join(parts).strip() 613 | 614 | refs = global_reg.references_markdown(url_to_doc) 615 | full_md = f"{body_renumbered}\n\n{refs}" 616 | 617 | total_citations = len(global_reg.ordered) 618 | total_chars = len(full_md) 619 | print(f"[REVIEW] → {total_chars} chars, {total_citations} unique sources") 620 | 621 | # Review & optional revise 622 | print("[REVIEW] Running quality review...") 623 | with dspy.context(lm=WRITER_LM): 624 | rv = REVIEWER(report_md=full_md) 625 | rj = safe_json_loads(rv.output_json, {}) or {} 626 | 627 | pass_checks = rj.get("pass_checks", False) 628 | issues = rj.get("issues", []) 629 | suggestions = rj.get("suggestions", []) 630 | 631 | print(f"[REVIEW] → Pass: {pass_checks}, Issues: {len(issues)}, Suggestions: {len(suggestions)}") 632 | 633 | if not pass_checks and suggestions: 634 | print(f"[REVIEW] Applying {len(suggestions)} revision suggestions...") 635 | with dspy.context(lm=WRITER_LM): 636 | rev = REVISER(report_md=full_md, suggestions="\n".join(f"- {s}" for s in suggestions)) 637 | full_md = (rev.improved_md or body_renumbered).strip() + "\n\n" + refs 638 | print("[REVIEW] → Revision complete") 639 | 640 | print() 641 | return {"report_md": full_md, "references_md": refs} 642 | 643 | # ---------------------------- 644 | # Evaluation 645 | # ---------------------------- 646 | 647 | DEFAULT_EVAL_QUESTIONS = [ 648 | "Does each section follow the instructions and include concrete facts?", 649 | "Are all nontrivial claims cited with [n] and do references look reputable?", 650 | "Is the structure clear with helpful headings/subheadings?", 651 | "Are there explicit dates for time-sensitive facts?", 652 | "Are there at least 2–3 sources per major section?", 653 | "Are URLs omitted from the prose (only numeric citations)?", 654 | "Is there any hallucination smell?", 655 | ] 656 | 657 | def eval_report_simple(md: str) -> EvalResult: 658 | checks = {} 659 | checks["has_h1"] = 1.0 if re.search(r"^#\s+", md, flags=re.M) else 0.0 660 | cites = len(re.findall(r"\[\d+\]", md)) 661 | checks["enough_cites"] = clamp(cites/10) 662 | checks["no_raw_urls"] = 1.0 if not re.search(r"https?://", md.split("## References")[0]) else 0.0 663 | checks["has_refs"] = 1.0 if "## References" in md else 0.0 664 | checks["length_ok"] = 1.0 if len(md) >= 2000 else 0.4 if len(md) >= 1200 else 0.1 665 | score = sum(checks.values())/len(checks) 666 | return EvalResult(score=score, breakdown=checks, notes=f"{cites} inline citations; {len(md)} chars.") 667 | 668 | # ---------------------------- 669 | # Build graph 670 | # ---------------------------- 671 | 672 | def build_graph() -> Any: 673 | graph = StateGraph(GraphState) 674 | graph.add_node("plan_queries", plan_queries) 675 | graph.add_node("search_node", search_node) # async 676 | graph.add_node("merge_and_gap_analyze", merge_and_gap_analyze) 677 | graph.add_node("write_section_node", write_section_node) 678 | graph.add_node("assemble_and_review", assemble_and_review) 679 | 680 | graph.add_edge(START, "plan_queries") 681 | graph.add_conditional_edges("plan_queries", route_queries, ["search_node"]) 682 | graph.add_edge("search_node", "merge_and_gap_analyze") 683 | graph.add_conditional_edges("merge_and_gap_analyze", route_or_write, ["search_node", "write_section_node"]) 684 | graph.add_edge("write_section_node", "assemble_and_review") 685 | graph.add_edge("assemble_and_review", END) 686 | return graph.compile() 687 | 688 | # ---------------------------- 689 | # Run end-to-end 690 | # ---------------------------- 691 | 692 | async def 
run_pipeline(topic: str, sections: List[SectionSpec], optimization: bool = False) -> Dict[str, Any]: 693 | print("\n" + "="*80) 694 | print(f"RESEARCH PIPELINE: {topic}") 695 | print("="*80) 696 | print(f"Sections: {', '.join([s.name for s in sections])}") 697 | print("="*80 + "\n") 698 | 699 | # GEPA optimization with module-specific training sets 700 | if optimization: 701 | print("[GEPA] Starting prompt optimization...") 702 | optimize_with_gepa() 703 | print("[GEPA] Optimization complete!\n") 704 | 705 | print("[PIPELINE] Building research graph...") 706 | app = build_graph() 707 | initial_state: GraphState = { 708 | "topic": topic, 709 | "sections": sections, 710 | "round": 0, 711 | "queries": [], 712 | "research": [], 713 | "drafts": {}, 714 | "cite_maps": {}, 715 | "used_urls": [], 716 | "report_md": None, 717 | "references_md": None, 718 | "eval_result": None, 719 | } 720 | 721 | print("[PIPELINE] Executing multi-agent research workflow...\n") 722 | final_state: GraphState = await app.ainvoke(initial_state) 723 | 724 | print("\n" + "="*80) 725 | print("[PIPELINE] Evaluating report quality...") 726 | print("="*80) 727 | 728 | # Evaluate 729 | md = final_state.get("report_md") or "" 730 | final_state["eval_result"] = eval_report_simple(md) 731 | 732 | # Save 733 | print("[PIPELINE] Saving report to ./report.md") 734 | with open("report.md", "w", encoding="utf-8") as f: 735 | f.write(md) 736 | 737 | print("[PIPELINE] ✓ Pipeline complete!\n") 738 | return final_state 739 | 740 | # ---------------------------- 741 | # Example usage 742 | # ---------------------------- 743 | 744 | SECTIONS = [ 745 | SectionSpec( 746 | name="Executive Summary", 747 | instructions="In 180–250 words, summarize the most decision-relevant takeaways. No citations here unless needed for key numbers." 748 | ), 749 | SectionSpec( 750 | name="Market Landscape", 751 | instructions="Define the space; 2023–2025 trends; include 4+ specific figures with sources." 752 | ), 753 | SectionSpec( 754 | name="Key Players & Differentiation", 755 | instructions="Compare 5–7 players; list 1–2 distinctive capabilities each; add 2–3 objective benchmarks with citations." 756 | ), 757 | SectionSpec( 758 | name="Risks & Open Questions", 759 | instructions="Top risks, unknowns, and watch items; cite evidence; use bullets." 760 | ), 761 | SectionSpec( 762 | name="Outlook (12–24 months)", 763 | instructions="3–5 grounded predictions with supporting evidence and explicit dates; include leading indicators to track." 764 | ), 765 | ] 766 | 767 | if __name__ == "__main__": 768 | topic = "State of Edge AI Acceleration (2024–2025)" 769 | try: 770 | final = asyncio.run(run_pipeline( 771 | topic=topic, 772 | sections=SECTIONS, 773 | # optimization=True # Uncomment to enable GEPA optimization 774 | )) 775 | except Exception as e: 776 | print("Pipeline failed:", e) 777 | raise 778 | 779 | print("\n" + "="*88) 780 | print("FINAL MARKDOWN (also saved to ./report.md):") 781 | print("="*88 + "\n") 782 | print(final.get("report_md", "")) 783 | 784 | ev: EvalResult = final.get("eval_result") or EvalResult(score=0.0, breakdown={}, notes="") 785 | print("\n" + "-"*88) 786 | print("EVALUATION (quick heuristic):") 787 | print("-"*88) 788 | print(f"Score: {ev.score:.2f}") 789 | print("Breakdown:", json.dumps(ev.breakdown, indent=2)) 790 | print("Notes:", ev.notes) 791 | --------------------------------------------------------------------------------