├── .gitignore ├── LICENSE ├── README.md ├── benchmark.py ├── deepseek_aider_evaluator.py ├── dynamic_fewshot_routing.py ├── jeopardy_dataset.py ├── math_calculator.py ├── math_calculator_optimizer.py ├── math_dataset_generator.py ├── math_evaluator.py ├── math_multiplication_optimizer.py ├── reasoning_pipeline.py ├── researcher.py ├── researcher_optimizer.py ├── residual_pipeline.py ├── residual_pipeline_optimizer.py ├── serper_search.py ├── signatures.py └── tree_of_thoughts.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | .aider* 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Tom Dörr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
4 | Exploring AI reasoning capabilities using DSPy 5 |
26 | This project explores how DSPy can be used to implement and analyze AI reasoning processes. 27 | It's a work in progress for experimenting with different reasoning approaches and patterns. 28 |
29 | 30 | 37 | 38 | ## What it does 39 | 40 | - Implements iterative reasoning processes 41 | - Analyzes reasoning patterns and logical validity 42 | - Tracks reasoning performance metrics 43 | - Provides detailed reasoning analysis 44 | 45 | ## Current Limitations 46 | 47 | This is an experimental project with several known limitations: 48 | 49 | - Reasoning quality depends heavily on the underlying model 50 | - Analysis capabilities are still basic 51 | - Performance metrics are simple 52 | - Needs more diverse test cases 53 | 54 | ## System Components 55 | 56 | ### 1. Math Multiplication Optimizer 57 | - Implements a multiplication solver using DSPy Chain-of-Thought 58 | - Uses MIPROv2 for optimization 59 | - Generates random multiplication problems for training 60 | - Evaluates accuracy on validation set 61 | - Example usage: 62 | ```bash 63 | python3 math_multiplication_optimizer.py 64 | ``` 65 | 66 | ### 2. Residual Pipeline Optimizer 67 | - Optimizes search-replace pipelines using BootstrapFewShot or MIPROv2 68 | - Supports both standard and iterative pipeline types 69 | - Tracks optimization history and best configurations 70 | - Example usage: 71 | ```bash 72 | python3 residual_pipeline_optimizer.py --pipeline-type standard --optimizer mipro 73 | ``` 74 | 75 | ### 3. Jeopardy Dataset Generator 76 | Generates challenging Jeopardy-style questions across multiple categories: 77 | - Creates initial questions and hints 78 | - Produces more challenging final questions 79 | - Saves dataset to `jeopardy_dataset.json` 80 | 81 | ### 4. Reasoning Pipeline 82 | Implements iterative reasoning with: 83 | - Context tracking and history 84 | - Objective achievement analysis 85 | - Formal logical fallacy detection: 86 | - Affirming the consequent 87 | - Denying the antecedent 88 | - Undistributed middle 89 | - Illicit major/minor 90 | - Mathematical validation 91 | - Termination logic 92 | 93 | ### 5. Benchmark System 94 | Measures pipeline performance by: 95 | - Running the reasoning pipeline on generated questions 96 | - Verifying answers using semantic matching 97 | - Tracking metrics: 98 | - Accuracy 99 | - Iterations per question 100 | - Processing time 101 | - Fallacy detection rates 102 | 103 | ## Installation 104 | 105 | 1. Clone the repository: 106 | ```bash 107 | git clone https://github.com/tom-doerr/dspy_reasoning.git 108 | cd dspy_reasoning 109 | ``` 110 | 111 | 2. Install dependencies: 112 | ```bash 113 | pip install -r requirements.txt 114 | ``` 115 | 116 | 3. Configure environment variables: 117 | ```bash 118 | export DSPY_MODEL=deepseek/deepseek-chat 119 | ``` 120 | 121 | 4. (Optional) Install development dependencies: 122 | ```bash 123 | pip install -r requirements-dev.txt 124 | ``` 125 | 126 | ## Usage 127 | 128 | 1. Generate Jeopardy questions: 129 | ```bash 130 | ./jeopardy_dataset.py -n 50 131 | ``` 132 | 133 | 2. Run reasoning pipeline benchmark: 134 | ```bash 135 | ./benchmark.py 136 | ``` 137 | 138 | 3. 
View results in `reasoning_benchmark.json` 139 | 140 | ## Configuration 141 | 142 | Customize settings in the scripts: 143 | - `jeopardy_dataset.py`: Adjust categories and question count 144 | - `reasoning_pipeline.py`: Modify reasoning parameters 145 | - `benchmark.py`: Change evaluation metrics 146 | 147 | ## Performance Metrics 148 | 149 | The system tracks comprehensive performance metrics across material batches: 150 | 151 | - **Batch Processing Time**: Average time per batch 152 | - **Batch Accuracy**: Percentage of correct solutions per batch 153 | - **Iteration Efficiency**: Average reasoning iterations per problem 154 | - **Fallacy Detection Rate**: Percentage of detected logical fallacies 155 | - **Objective Achievement**: Success rate in meeting problem objectives 156 | 157 | Benchmark results include aggregate statistics across all batches: 158 | - Overall accuracy 159 | - Average iterations per question 160 | - Total processing time 161 | - Fallacy detection rates 162 | - Objective achievement scores 163 | 164 | ## Contributing 165 | 166 | Contributions are welcome! Please follow these steps: 167 | 168 | 1. Fork the repository 169 | 2. Create a feature branch (`git checkout -b feature/AmazingFeature`) 170 | 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`) 171 | 4. Push to the branch (`git push origin feature/AmazingFeature`) 172 | 5. Open a pull request 173 | 174 | Please make sure to update tests as appropriate and follow the coding style of the project. 175 | 176 | ## License 177 | 178 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 179 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | import json 4 | import dspy 5 | from jeopardy_dataset import JeopardyDatasetGenerator 6 | from reasoning_pipeline import run_reasoning_pipeline 7 | 8 | class VerifyAnswerSignature(dspy.Signature): 9 | predicted_answer = dspy.InputField(desc="The answer predicted by the model") 10 | correct_answer = dspy.InputField(desc="The known correct answer") 11 | verification = dspy.OutputField(desc="True if answers match semantically, False otherwise") 12 | 13 | class AnswerVerifier(dspy.Module): 14 | def __init__(self): 15 | super().__init__() 16 | self.verify = dspy.ChainOfThought(VerifyAnswerSignature) 17 | 18 | def forward(self, predicted_answer, correct_answer): 19 | return self.verify(predicted_answer=predicted_answer, correct_answer=correct_answer) 20 | 21 | def verify_answer_match(predicted_answer, correct_answer): 22 | """Check if the predicted answer matches the correct answer using semantic verification""" 23 | verifier = AnswerVerifier() 24 | result = verifier(predicted_answer, correct_answer) 25 | return result.verification.lower().strip() in ["true", "yes", "correct"] 26 | 27 | def benchmark_reasoning_pipeline(): 28 | print("Benchmarking Reasoning Pipeline Performance...") 29 | 30 | # Load generated dataset 31 | with open("jeopardy_dataset.json") as f: 32 | dataset = json.load(f) 33 | 34 | # Track pipeline performance metrics 35 | pipeline_metrics = { 36 | "total_questions": len(dataset), 37 | "total_iterations": 0, 38 | "correct_answers": 0, 39 | "time_seconds": 0 40 | } 41 | 42 | start_time = time.time() 43 | for i, item in enumerate(dataset, 1): 44 | print(f"\nTesting Pipeline on Question {i}/{len(dataset)}") 45 | # Run reasoning pipeline directly 46 
| context = f""" 47 | Final Question: {item["question"]} 48 | Hint: {item["hint"]} 49 | Answer: {item["answer"]} 50 | """ 51 | objective = "Determine the correct answer to the question using the provided hint" 52 | 53 | reasoning_output = [] 54 | def capture_reasoning(iteration, context, objective, result): 55 | reasoning_output.append({ 56 | "iteration": iteration, 57 | "context": context, 58 | "objective": objective, 59 | "result": result 60 | }) 61 | 62 | run_reasoning_pipeline(context, objective, callback=capture_reasoning) 63 | 64 | # Check if final reasoning output matches correct answer 65 | if reasoning_output: 66 | final_result = reasoning_output[-1]["result"] 67 | is_correct = verify_answer_match(final_result.reasoning_output, item["answer"]) 68 | pipeline_metrics["correct_answers"] += int(is_correct) 69 | pipeline_metrics["total_iterations"] += len(reasoning_output) 70 | 71 | # Print progress 72 | print(f"\nCurrent Progress: {i}/{pipeline_metrics['total_questions']}") 73 | print(f"Iterations: {len(reasoning_output)}") 74 | current_accuracy = pipeline_metrics["correct_answers"] / i 75 | print(f"Current Accuracy: {current_accuracy:.1%}") 76 | 77 | elapsed_time = time.time() - start_time 78 | print() # New line after progress 79 | 80 | # Calculate final averages 81 | pipeline_metrics["time_seconds"] = elapsed_time 82 | pipeline_metrics["accuracy"] = pipeline_metrics["correct_answers"] / pipeline_metrics["total_questions"] 83 | pipeline_metrics["average_iterations"] = pipeline_metrics["total_iterations"] / pipeline_metrics["total_questions"] 84 | 85 | # Save results 86 | with open("reasoning_benchmark.json", "w") as f: 87 | json.dump(pipeline_metrics, f, indent=2) 88 | 89 | print(f"Tested pipeline on {pipeline_metrics['total_questions']} questions in {elapsed_time:.2f} seconds") 90 | print(f"Average Iterations: {pipeline_metrics['average_iterations']:.1f}") 91 | print(f"Answer Accuracy: {pipeline_metrics['accuracy']:.1%}") 92 | 93 | if __name__ == "__main__": 94 | benchmark_reasoning_pipeline() 95 | print("\nBenchmark results saved to reasoning_benchmark.json") 96 | -------------------------------------------------------------------------------- /deepseek_aider_evaluator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from deepeval.benchmarks import Aider 4 | from deepeval.benchmarks.tasks import AiderTask 5 | from deepeval.models import DeepSeekModel 6 | import dspy 7 | import json 8 | from tqdm import tqdm 9 | 10 | class DeepSeekEvaluator: 11 | def __init__(self): 12 | # Configure DeepSeek model 13 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False) 14 | dspy.settings.configure(lm=self.lm) 15 | 16 | # Initialize DeepEval Aider benchmark 17 | self.benchmark = Aider( 18 | tasks=[ 19 | AiderTask.CODE_EDITING, 20 | AiderTask.CODE_REFACTORING 21 | ], 22 | n=100 # Number of code generation samples 23 | ) 24 | 25 | def evaluate(self): 26 | print("Starting DeepSeek evaluation on Aider benchmark...") 27 | 28 | # Create DeepSeek model wrapper for DeepEval 29 | class DeepSeekWrapper(DeepSeekModel): 30 | def __init__(self): 31 | super().__init__() 32 | self.model = self.lm 33 | 34 | def generate_samples(self, prompt: str, n: int, temperature: float) -> tuple[str, float]: 35 | # Use DSPy's DeepSeek model for generation 36 | result = self.model(prompt) 37 | return result, 1.0 # Return generated text and confidence score 38 | 39 | def load_model(self): 40 | # Initialize the model 41 | self.model 
= self.lm 42 | 43 | # Evaluate the model 44 | self.benchmark.evaluate(model=DeepSeekWrapper(), k=10) 45 | 46 | # Print results 47 | print("\nEvaluation Results:") 48 | print(f"Overall Score: {self.benchmark.overall_score:.1%}") 49 | print("\nTask-wise Scores:") 50 | for task, score in self.benchmark.task_scores.items(): 51 | print(f"{task}: {score:.1%}") 52 | 53 | # Save results 54 | results = { 55 | "overall_score": self.benchmark.overall_score, 56 | "task_scores": self.benchmark.task_scores, 57 | "model": "deepseek/deepseek-chat", 58 | "temperature": 0.3 59 | } 60 | 61 | with open("deepseek_aider_results.json", "w") as f: 62 | json.dump(results, f, indent=2) 63 | 64 | print("\nResults saved to deepseek_aider_results.json") 65 | 66 | if __name__ == "__main__": 67 | evaluator = DeepSeekEvaluator() 68 | evaluator.evaluate() 69 | -------------------------------------------------------------------------------- /dynamic_fewshot_routing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from simpledspy import pipe 4 | import numpy as np 5 | from math_calculator import MathCalculator 6 | import random 7 | 8 | 9 | calculator = MathCalculator() 10 | 11 | 12 | def generate_multiplication_task(num_digits): 13 | first_digit = np.random.randint(0, 10**num_digits) 14 | second_digit = np.random.randint(0, 10**num_digits) 15 | solution = first_digit * second_digit 16 | task = f"{first_digit} * {second_digit}" 17 | return task, solution 18 | 19 | 20 | def generate_program_reasoning_task(): 21 | num_digits = 4 22 | first_digit = np.random.randint(0, 10**num_digits) 23 | second_digit = np.random.randint(0, 10**num_digits) 24 | if second_digit > 5000: 25 | solution = 10 26 | else: 27 | solution = first_digit * second_digit 28 | # return 100 29 | task = f"f({first_digit}, {second_digit}) = " 30 | return task, solution 31 | 32 | def sample_memories(memory): 33 | return_list = [] 34 | # for sample in memory: 35 | for i, sample in enumerate(memory): 36 | # sample with prob of weight 37 | if random.random() < sample['weight']: 38 | sample['i'] = i 39 | return_list.append(sample) 40 | 41 | return return_list 42 | 43 | def construct_prompt(fewshot_samples): 44 | fewshot_str = "" 45 | hypothesis_str = "" 46 | for sample in fewshot_samples: 47 | if 'task' in sample: 48 | # fewshot_str += f"Task: {sample['task']}\nOutput: {sample['output']}\n" 49 | fewshot_str += f"Task: {sample['task']}\nOutput: {sample['output']}\Output score: {sample['metric']}\n" 50 | elif 'hypothesis' in sample: 51 | hypothesis_str += f"Hypothesis: {sample['hypothesis']}\n" 52 | 53 | return fewshot_str, hypothesis_str 54 | 55 | instruction = '' 56 | metric_values = [] 57 | score_values = [] 58 | memory = [] 59 | iteration = 0 60 | while True: 61 | # for i in range(10): 62 | task, solution = generate_multiplication_task(4) 63 | # task, solution = generate_program_reasoning_task() 64 | memory_samples = sample_memories(memory) 65 | # memory_str = construct_prompt(memory_samples) 66 | fewshot_str, hypothesis_str = construct_prompt(memory_samples) 67 | memory_str = f"Hypothesis: {hypothesis_str}\n Fewshot Examples:\n{fewshot_str}" 68 | input_ = f"Current task: {task}\n{memory_str}Task: {task}\nOutput: " 69 | reasoning, result, new_hypothesis = pipe(instruction, input_) 70 | output = (reasoning, result) 71 | # check if number 72 | if result.isdigit() and int(result) == solution: 73 | # metric_values.append(1) 74 | # metric_value = 1 75 | metric_value = min(1, 1/abs(float(result) - 
solution + 0.00001)) 76 | else: 77 | # metric_values.append(0) 78 | metric_value = 0 79 | 80 | num_memory_samples = len(memory_samples) 81 | score = metric_value - (num_memory_samples/100) 82 | score_values.append(score) 83 | avg_score = np.mean(score_values[-100:]) 84 | 85 | metric_values.append(metric_value) 86 | avg_metric = np.mean(metric_values[-100:]) 87 | num_fewshot_samples = len([sample for sample in memory_samples if 'task' in sample]) 88 | num_hypothesis_samples = len([sample for sample in memory_samples if 'hypothesis' in sample]) 89 | weight = avg_score if avg_score > 0 else 0.1 90 | memory.append({'hypothesis': new_hypothesis, 'weight': weight}) 91 | memory.append({'task': task, 'output': output, 'metric': metric_value, 'weight': weight}) 92 | # if metric_value == 1: 93 | # memory.append({'task': task, 'output': output, 'metric': metric_value, 'weight': weight}) 94 | 95 | for sample in memory_samples: 96 | penalty_factor = 0.5 97 | # if metric_value == 1: 98 | if score > avg_score: 99 | memory[sample['i']]['weight'] *= ((1-penalty_factor)/avg_metric) + penalty_factor 100 | else: 101 | memory[sample['i']]['weight'] *= penalty_factor 102 | 103 | if memory[sample['i']]['weight'] > 1: 104 | memory[sample['i']]['weight'] = 1 105 | 106 | 107 | 108 | 109 | print(f"input_: {input_}") 110 | print(f"reasoning: {reasoning}") 111 | print(f"hypothesis_str: {hypothesis_str}") 112 | print(f'Hypothesis: {new_hypothesis}') 113 | print(f"iter: {iteration}, task: {task}, Solution: {solution}, result: {result}, num_fs: {num_fewshot_samples}, num_hypo: {num_hypothesis_samples}, Avg Metric: {avg_metric}") 114 | iteration += 1 115 | 116 | 117 | -------------------------------------------------------------------------------- /jeopardy_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | from tqdm import tqdm 6 | 7 | # Configure the LM with temperature=1.5 and no caching 8 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 9 | dspy.settings.configure(lm=lm) 10 | 11 | # Define signatures for three-step question generation 12 | class GenerateAnswerSignature(dspy.Signature): 13 | category = dspy.InputField(desc="The category for the question") 14 | answer = dspy.OutputField(desc="A challenging answer for a Jeopardy question. 
Generate just the answer, not the question.") 15 | 16 | class GenerateInitialQuestionSignature(dspy.Signature): 17 | category = dspy.InputField(desc="The category for the question") 18 | answer = dspy.InputField(desc="The specific answer to create a question for") 19 | question = dspy.OutputField(desc="A Jeopardy-style clue that leads to the answer") 20 | 21 | class GenerateHintSignature(dspy.Signature): 22 | category = dspy.InputField(desc="The category for the question") 23 | answer = dspy.InputField(desc="The specific answer to create a hint for") 24 | initial_question = dspy.InputField(desc="The initial question that directly leads to the answer") 25 | hint = dspy.OutputField(desc="An indirect clue that points to the answer without repeating information from the initial question") 26 | 27 | class GenerateChallengingQuestionSignature(dspy.Signature): 28 | category = dspy.InputField(desc="The category for the question") 29 | answer = dspy.InputField(desc="The specific answer to create a question for") 30 | hint = dspy.InputField(desc="An indirect clue that points to the answer") 31 | question = dspy.OutputField(desc="A challenging Jeopardy-style clue that incorporates the hint and requires reasoning to reach the answer") 32 | 33 | class JeopardyDatasetGenerator(dspy.Module): 34 | def __init__(self): 35 | super().__init__() 36 | self.generate_answer = dspy.ChainOfThought(GenerateAnswerSignature) 37 | self.generate_initial_question = dspy.ChainOfThought(GenerateInitialQuestionSignature) 38 | self.generate_hint = dspy.ChainOfThought(GenerateHintSignature) 39 | self.generate_challenging_question = dspy.ChainOfThought(GenerateChallengingQuestionSignature) 40 | 41 | def generate_dataset(self, categories, num_questions_per_category=1): 42 | dataset = [] 43 | total_questions = len(categories) * num_questions_per_category 44 | 45 | # Single progress bar for all questions 46 | with tqdm(total=total_questions, desc="Generating Questions") as pbar: 47 | for category in categories: 48 | for _ in range(num_questions_per_category): 49 | # First generate a challenging answer 50 | answer_result = self.generate_answer(category=category) 51 | 52 | # First generate an initial direct question 53 | initial_question_result = self.generate_initial_question( 54 | category=category, 55 | answer=answer_result.answer 56 | ) 57 | 58 | # Generate a hint that points to the answer without repeating the initial question 59 | hint_result = self.generate_hint( 60 | category=category, 61 | answer=answer_result.answer, 62 | initial_question=initial_question_result.question 63 | ) 64 | 65 | # Generate a more challenging question using the hint 66 | question_result = self.generate_challenging_question( 67 | category=category, 68 | answer=answer_result.answer, 69 | hint=hint_result.hint 70 | ) 71 | 72 | # Create the dataset entry 73 | entry = { 74 | "category": category, 75 | "question": question_result.question, 76 | "answer": answer_result.answer, 77 | "initial_question": initial_question_result.question, 78 | "hint": hint_result.hint 79 | } 80 | dataset.append(entry) 81 | 82 | # Print formatted output 83 | print("\nGenerated Question:") 84 | print(f"Category: {entry['category']}") 85 | print(f"Initial Question: {entry['initial_question']}") 86 | print(f"Hint: {entry['hint']}") 87 | print(f"Final Question: {entry['question']}") 88 | print(f"Answer: {entry['answer']}") 89 | print("-" * 80) 90 | 91 | # Update progress bar 92 | pbar.update(1) 93 | return dataset 94 | 95 | import argparse 96 | 97 | if __name__ == "__main__": 98 
| # Initialize the generator 99 | generator = JeopardyDatasetGenerator() 100 | 101 | # Define some categories 102 | categories = [ 103 | "History", 104 | "Science & Nature", 105 | "Literature", 106 | "Pop Culture", 107 | "Geography", 108 | "Technology", 109 | "Computers", 110 | "Artificial Intelligence", 111 | "LLMs", 112 | "Deep Learning", 113 | ] 114 | 115 | # Set up argument parser 116 | parser = argparse.ArgumentParser(description="Generate Jeopardy questions") 117 | parser.add_argument("-n", "--num_questions", type=int, default=50, 118 | help="Number of questions to generate (default: 50)") 119 | args = parser.parse_args() 120 | 121 | # Calculate number of questions per category 122 | num_categories = len(categories) 123 | base_questions = args.num_questions // num_categories 124 | extra_questions = args.num_questions % num_categories 125 | 126 | # Generate questions, cycling through categories 127 | dataset = [] 128 | for i in range(num_categories): 129 | questions_to_generate = base_questions + (1 if i < extra_questions else 0) 130 | if questions_to_generate > 0: 131 | category_questions = generator.generate_dataset( 132 | [categories[i]], 133 | num_questions_per_category=questions_to_generate 134 | ) 135 | dataset.extend(category_questions) 136 | 137 | # Save to JSON file 138 | with open("jeopardy_dataset.json", "w") as f: 139 | json.dump(dataset, f, indent=2) 140 | 141 | print(f"Generated {len(dataset)} Jeopardy questions!") 142 | print("Dataset saved to jeopardy_dataset.json") 143 | -------------------------------------------------------------------------------- /math_calculator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import math 5 | import json 6 | import time 7 | import tqdm 8 | from pprint import pprint 9 | from collections import Counter 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | from typing import List, Dict, Any 12 | from signatures import ( 13 | SolutionSelectorSignature, 14 | MathCalculationSignature, 15 | TaskSplitterSignature, 16 | SubtaskResultSelectorSignature 17 | ) 18 | from math_evaluator import MathEvaluator 19 | 20 | class MathCalculator(dspy.Module): 21 | """Base math calculator module that ProblemSolver extends""" 22 | def __init__(self): 23 | super().__init__() 24 | self.calculate = dspy.ChainOfThought(MathCalculationSignature) 25 | 26 | def forward(self, task): 27 | """Basic forward pass without advanced reasoning""" 28 | result = self.calculate(task=task) 29 | return dspy.Prediction( 30 | reasoning=result.reasoning, 31 | solution=result.solution, 32 | notes_output=result.notes_output 33 | ) 34 | 35 | class ProblemSolver(dspy.Module): 36 | def __init__(self, max_iterations=5, num_attempts=3, subtask_attempts=3): 37 | """Initialize the ProblemSolver with DSPy modules and configuration. 
38 | 39 | Args: 40 | max_iterations: Maximum number of reasoning iterations per attempt 41 | num_attempts: Number of attempts to solve each task 42 | subtask_attempts: Number of attempts to solve each subtask 43 | """ 44 | super().__init__() 45 | # Initialize instance variables first 46 | self.max_iterations = max_iterations 47 | self.num_attempts = num_attempts 48 | self.subtask_attempts = subtask_attempts 49 | 50 | self.reasoning_tree = { 51 | 'root': None, 52 | 'nodes': {}, 53 | 'metadata': { 54 | 'start_time': time.time(), 55 | 'config': { 56 | 'max_iterations': self.max_iterations, 57 | 'num_attempts': self.num_attempts, 58 | 'subtask_attempts': self.subtask_attempts 59 | } 60 | } 61 | } 62 | self.current_node_id = 0 63 | 64 | # Initialize DSPy modules 65 | self.calculate = dspy.ChainOfThought(MathCalculationSignature) 66 | self.select_solution = dspy.ChainOfThought(SolutionSelectorSignature) 67 | self.split_task = dspy.ChainOfThought(TaskSplitterSignature) 68 | self.select_subtask_result = dspy.ChainOfThought(SubtaskResultSelectorSignature) 69 | 70 | def _create_node(self, task, parent_id=None, node_type='task', input_data=None, output_data=None): 71 | """Create a new node in the reasoning tree with input/output tracking""" 72 | node_id = f"node_{self.current_node_id}" 73 | self.current_node_id += 1 74 | 75 | node = { 76 | 'id': node_id, 77 | 'type': node_type, 78 | 'task': task, 79 | 'parent': parent_id, 80 | 'children': [], 81 | 'attempts': [], 82 | 'input': input_data if input_data else {}, 83 | 'output': output_data if output_data else {}, 84 | 'timestamp': time.time() 85 | } 86 | 87 | self.reasoning_tree['nodes'][node_id] = node 88 | 89 | if parent_id: 90 | self.reasoning_tree['nodes'][parent_id]['children'].append(node_id) 91 | 92 | if not self.reasoning_tree['root']: 93 | self.reasoning_tree['root'] = node_id 94 | 95 | return node_id 96 | 97 | def _split_task(self, task, depth=0, max_depth=3): 98 | """Split a general problem into subtasks using DSPy reasoning""" 99 | if depth >= max_depth: 100 | print(f"Max recursion depth {max_depth} reached for task: {task}") 101 | return [task] 102 | 103 | try: 104 | # Log task splitting attempt 105 | print(f"Attempting to split task (Depth {depth}): {task}") 106 | 107 | result = self.split_task(task=task, context="") 108 | if not hasattr(result, 'subtasks'): 109 | print(f"Failed to split task - no subtasks returned: {task}") 110 | return [task] 111 | 112 | # Parse subtasks from the output 113 | subtasks = [] 114 | if isinstance(result.subtasks, str): 115 | subtasks = [s.strip() for s in result.subtasks.split('\n') if s.strip()] 116 | elif isinstance(result.subtasks, list): 117 | subtasks = [str(s).strip() for s in result.subtasks if str(s).strip()] 118 | 119 | # Log the split reasoning and results 120 | print(f"Task Split Reasoning (Depth {depth}):\n{result.split_reasoning}") 121 | print(f"Generated Subtasks: {subtasks}") 122 | 123 | # Recursively split subtasks if needed 124 | final_subtasks = [] 125 | for subtask in subtasks: 126 | try: 127 | # Only split further if the subtask is complex enough 128 | if len(subtask.split()) > 5: # Simple heuristic based on length 129 | final_subtasks.extend(self._split_task(subtask, depth+1, max_depth)) 130 | else: 131 | final_subtasks.append(subtask) 132 | except Exception as e: 133 | print(f"Error recursively splitting subtask {subtask}: {e}") 134 | final_subtasks.append(subtask) 135 | 136 | return final_subtasks if final_subtasks else [task] 137 | except Exception as e: 138 | print(f"Error splitting 
task {task}: {e}") 139 | return [task] 140 | 141 | def _combine_subtask_results(self, subtask_results: List[dspy.Prediction]) -> Dict[str, Any]: 142 | """Combine results from DSPy-generated subtasks""" 143 | if not subtask_results: 144 | return dspy.Prediction( 145 | reasoning="No subtask results to combine", 146 | solution=None, 147 | notes_output="" 148 | ) 149 | 150 | # Build combined reasoning 151 | combined_reasoning = [] 152 | combined_solution = [] 153 | 154 | for i, result in enumerate(subtask_results, 1): 155 | combined_reasoning.append( 156 | f"Subtask {i}:\n" 157 | f"Reasoning: {result.reasoning}\n" 158 | f"Solution: {result.solution}\n" 159 | ) 160 | if result.solution: 161 | combined_solution.append(str(result.solution)) 162 | 163 | # Combine solutions in a meaningful way 164 | final_solution = "\n".join(combined_solution) if combined_solution else "No solution found" 165 | 166 | return dspy.Prediction( 167 | reasoning="Combined subtask results:\n" + "\n".join(combined_reasoning), 168 | solution=final_solution, 169 | notes_output="Combined results from subtasks" 170 | ) 171 | 172 | def forward(self, task): 173 | """Forward pass for the math calculator with recursive task splitting""" 174 | # First try to split the task into subtasks recursively 175 | subtasks = self._split_task(task, max_depth=3) # Set max recursion depth 176 | 177 | if len(subtasks) > 1: 178 | # Process each subtask independently with multiple attempts 179 | subtask_results = [] 180 | for subtask in subtasks: 181 | if subtask in ['+', '-', '*', '/', '^', '√', '%']: 182 | # Keep operators as-is 183 | subtask_results.append(dspy.Prediction( 184 | reasoning="Operator", 185 | solution=subtask, 186 | notes_output="" 187 | )) 188 | else: 189 | # Process numerical subtasks with multiple attempts 190 | result = self._process_subtask(subtask) 191 | subtask_results.append(result) 192 | 193 | # Combine subtask results 194 | final_solution = self._combine_subtask_results(subtask_results) 195 | final_reasoning = "\n".join( 196 | f"Subtask {i+1} ({subtask}):\n{r.reasoning}\nSolution: {r.solution}\n" 197 | for i, (subtask, r) in enumerate(zip(subtasks, subtask_results)) 198 | ) 199 | 200 | return dspy.Prediction( 201 | reasoning=f"Task split into {len(subtasks)} subtasks:\n{final_reasoning}", 202 | solution=final_solution, 203 | notes_output="Task split into subtasks" 204 | ) 205 | 206 | # Fall back to original processing if no subtasks found 207 | attempts = [] 208 | 209 | # Run multiple attempts 210 | for attempt in range(self.num_attempts): 211 | context = "" 212 | final_reasoning = "" 213 | final_solution = "" 214 | 215 | for iteration in range(self.max_iterations): 216 | try: 217 | result = self.calculate(task=task, context=context) 218 | 219 | # Validate required fields 220 | if not all(hasattr(result, field) for field in ['reasoning', 'solution', 'notes_output', 'iteration_control']): 221 | raise ValueError("Missing required fields in model output") 222 | 223 | # Accumulate reasoning 224 | final_reasoning += f"\nAttempt {attempt + 1}, Iteration {iteration + 1} Reasoning:\n{result.reasoning}" 225 | 226 | # Build context for next iteration 227 | iteration_context = ( 228 | f"Iteration {iteration + 1}:\n" 229 | f"Reasoning: {result.reasoning}\n" 230 | f"Solution: {result.solution}\n" 231 | f"Notes: {result.notes_output}\n" 232 | ) 233 | context += "\n" + iteration_context 234 | 235 | # Store the latest solution 236 | final_solution = result.solution 237 | 238 | # Check if we should terminate 239 | if 
result.iteration_control.lower().strip() == "terminate": 240 | break 241 | 242 | except ValueError as e: 243 | print(f"Validation error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 244 | continue 245 | except RuntimeError as e: 246 | print(f"Runtime error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 247 | continue 248 | except Exception as e: 249 | print(f"Unexpected error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 250 | continue 251 | 252 | attempts.append({ 253 | 'reasoning': final_reasoning, 254 | 'solution': final_solution, 255 | 'notes_output': context 256 | }) 257 | 258 | # Select the best solution 259 | selection_result = self.select_solution( 260 | task=task, 261 | solutions=[f"Attempt {i+1}:\nReasoning: {a['reasoning']}\nSolution: {a['solution']}" 262 | for i, a in enumerate(attempts)], 263 | selection_criteria="Select the solution that is mathematically correct, logically consistent, " 264 | "has clear reasoning, and provides a complete solution to the task" 265 | ) 266 | 267 | # Find the selected solution 268 | selected_solution = selection_result.selected_solution 269 | selection_reasoning = selection_result.selection_reasoning 270 | 271 | # Try to match the selected solution 272 | for attempt in attempts: 273 | if attempt['solution'] == selected_solution: 274 | # Add selection reasoning to the final output 275 | final_reasoning = ( 276 | f"Selected Solution Reasoning:\n{selection_reasoning}\n\n" 277 | f"Solution Details:\n{attempt['reasoning']}" 278 | ) 279 | return dspy.Prediction( 280 | reasoning=final_reasoning, 281 | solution=attempt['solution'], 282 | notes_output=attempt['notes_output'] 283 | ) 284 | 285 | # If no solution was selected, choose the most consistent one 286 | if len(attempts) > 1: 287 | # Find the most common solution 288 | solution_counts = Counter(a['solution'] for a in attempts) 289 | most_common_solution = solution_counts.most_common(1)[0][0] 290 | 291 | # Return the first attempt with the most common solution 292 | for attempt in attempts: 293 | if attempt['solution'] == most_common_solution: 294 | final_reasoning = ( 295 | "No clear selection - using most consistent solution:\n" 296 | f"Solution appeared {solution_counts[most_common_solution]} times\n\n" 297 | f"Solution Details:\n{attempt['reasoning']}" 298 | ) 299 | return dspy.Prediction( 300 | reasoning=final_reasoning, 301 | solution=attempt['solution'], 302 | notes_output=attempt['notes_output'] 303 | ) 304 | 305 | # Fall back to the first attempt 306 | final_reasoning = ( 307 | "Using first attempt as fallback solution\n\n" 308 | f"Solution Details:\n{attempts[0]['reasoning']}" 309 | ) 310 | return dspy.Prediction( 311 | reasoning=final_reasoning, 312 | solution=attempts[0]['solution'], 313 | notes_output=attempts[0]['notes_output'] 314 | ) 315 | 316 | def evaluate_on_dataset(self, dataset_path="math_dataset.json", max_iter=None, num_threads=10): 317 | evaluator = MathEvaluator(self, num_threads) 318 | return evaluator.evaluate_on_dataset(dataset_path) 319 | 320 | def _process_subtask(self, subtask, parent_id=None): 321 | """Process a subtask with multiple attempts and select the best result""" 322 | # Create node for this subtask 323 | subtask_node_id = self._create_node( 324 | task=subtask, 325 | parent_id=parent_id, 326 | node_type='subtask', 327 | input_data={ 328 | 'subtask': subtask, 329 | 'parent_id': parent_id, 330 | 'timestamp': time.time() 331 | } 332 | ) 333 | 334 | attempts = [] 335 | 336 | for attempt in 
range(self.subtask_attempts): 337 | # Create node for this attempt 338 | attempt_node_id = self._create_node( 339 | task=subtask, 340 | parent_id=subtask_node_id, 341 | node_type='attempt', 342 | input_data={ 343 | 'attempt_number': attempt + 1, 344 | 'subtask': subtask, 345 | 'timestamp': time.time() 346 | } 347 | ) 348 | try: 349 | result = self._forward_with_max_iter(subtask, self.max_iterations) 350 | 351 | # Update attempt node with output 352 | self.reasoning_tree['nodes'][attempt_node_id]['output'] = { 353 | 'reasoning': result.reasoning, 354 | 'solution': result.solution, 355 | 'notes': result.notes_output, 356 | 'timestamp': time.time() 357 | } 358 | 359 | attempts.append({ 360 | 'reasoning': result.reasoning, 361 | 'solution': result.solution, 362 | 'notes': result.notes_output 363 | }) 364 | except Exception as e: 365 | print(f"Error in subtask attempt {attempt + 1}: {e}") 366 | continue 367 | 368 | # Select the best result using DSPy 369 | if len(attempts) > 1: 370 | selection_result = self.select_subtask_result( 371 | subtask=subtask, 372 | attempts=[f"Attempt {i+1}:\nReasoning: {a['reasoning']}\nSolution: {a['solution']}" 373 | for i, a in enumerate(attempts)] 374 | ) 375 | 376 | # Find the selected solution 377 | for attempt in attempts: 378 | if attempt['solution'] == selection_result.selected_solution: 379 | return dspy.Prediction( 380 | reasoning=f"Selected Solution Reasoning:\n{selection_result.selection_reasoning}\n\n" 381 | f"Solution Details:\n{attempt['reasoning']}", 382 | solution=attempt['solution'], 383 | notes_output=attempt['notes'] 384 | ) 385 | 386 | # If no selection or only one attempt, return the first result 387 | if attempts: 388 | return dspy.Prediction( 389 | reasoning=attempts[0]['reasoning'], 390 | solution=attempts[0]['solution'], 391 | notes_output=attempts[0]['notes'] 392 | ) 393 | 394 | # Fallback if all attempts failed 395 | return dspy.Prediction( 396 | reasoning="All attempts failed to solve the subtask", 397 | solution="0", 398 | notes_output="" 399 | ) 400 | 401 | def _forward_with_max_iter(self, task, max_iter): 402 | """Modified forward pass with configurable max iterations""" 403 | context = "" 404 | final_reasoning = "" 405 | final_solution = "" 406 | 407 | for iteration in range(max_iter): 408 | try: 409 | result = self.calculate(task=task, context=context) 410 | 411 | # Validate required fields 412 | if not all(hasattr(result, field) for field in ['reasoning', 'solution', 'notes_output', 'iteration_control']): 413 | raise ValueError("Missing required fields in model output") 414 | 415 | # Accumulate reasoning 416 | final_reasoning += f"\nIteration {iteration + 1} Reasoning:\n{result.reasoning}" 417 | 418 | # Build context for next iteration 419 | iteration_context = ( 420 | f"Iteration {iteration + 1}:\n" 421 | f"Reasoning: {result.reasoning}\n" 422 | f"Solution: {result.solution}\n" 423 | f"Notes: {result.notes_output}\n" 424 | ) 425 | context += "\n" + iteration_context 426 | 427 | # Store the latest solution 428 | final_solution = result.solution 429 | 430 | # Check if we should terminate 431 | if result.iteration_control.lower().strip() == "terminate": 432 | break 433 | 434 | except ValueError as e: 435 | print(f"Validation error in iteration {iteration + 1}: {str(e)}") 436 | continue 437 | except RuntimeError as e: 438 | print(f"Runtime error in iteration {iteration + 1}: {str(e)}") 439 | continue 440 | except Exception as e: 441 | print(f"Unexpected error in iteration {iteration + 1}: {str(e)}") 442 | continue 443 | 444 | return 
dspy.Prediction( 445 | reasoning=final_reasoning, 446 | solution=final_solution, 447 | notes_output=context 448 | ) 449 | 450 | def save_reasoning_tree(self, path="reasoning_tree.json"): 451 | """Save the full reasoning tree to a JSON file with enhanced details""" 452 | # Add final metadata 453 | self.reasoning_tree['metadata']['end_time'] = time.time() 454 | self.reasoning_tree['metadata']['duration'] = ( 455 | self.reasoning_tree['metadata']['end_time'] - 456 | self.reasoning_tree['metadata']['start_time'] 457 | ) 458 | 459 | # Save with pretty printing 460 | with open(path, "w") as f: 461 | json.dump(self.reasoning_tree, f, indent=2, sort_keys=True) 462 | 463 | print(f"Reasoning tree saved to {path}") 464 | print(f"Total nodes: {len(self.reasoning_tree['nodes'])}") 465 | print(f"Duration: {self.reasoning_tree['metadata']['duration']:.2f}s") 466 | 467 | def _is_correct(self, predicted, expected): 468 | """Compare solutions with tolerance for floating point""" 469 | try: 470 | # Handle both string and numeric inputs 471 | predicted_num = float(predicted) if isinstance(predicted, str) else float(predicted) 472 | expected_num = float(expected) if isinstance(expected, str) else float(expected) 473 | 474 | # Handle NaN and infinity 475 | if math.isnan(predicted_num) or math.isnan(expected_num): 476 | return False 477 | if math.isinf(predicted_num) or math.isinf(expected_num): 478 | return False 479 | 480 | # Compare with tolerance 481 | return abs(predicted_num - expected_num) < 0.01 482 | except (ValueError, TypeError) as e: 483 | print(f"⚠️ Error evaluating solution - invalid number format: {str(e)}") 484 | return False 485 | except Exception as e: 486 | print(f"⚠️ Unexpected error evaluating solution: {str(e)}") 487 | return False 488 | 489 | 490 | if __name__ == "__main__": 491 | # Configure DSPy 492 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False) 493 | dspy.settings.configure(lm=lm) 494 | 495 | # Create calculator instance with subtask processing 496 | calculator = ProblemSolver(max_iterations=3, num_attempts=2, subtask_attempts=2) 497 | 498 | # Test complex task that should be split into subtasks 499 | complex_task = "Calculate (3 + 4) * (5 - 2) / (6 + 3)" 500 | 501 | print(f"\nProcessing complex task: {complex_task}") 502 | result = calculator.forward(complex_task) 503 | 504 | print("\nFinal Result:") 505 | print(f"Reasoning:\n{result.reasoning}") 506 | print(f"Solution: {result.solution}") 507 | 508 | # Save result 509 | with open("subtask_result.json", "w") as f: 510 | json.dump({ 511 | "task": complex_task, 512 | "reasoning": result.reasoning, 513 | "solution": result.solution 514 | }, f, indent=2) 515 | print("\nResult saved to subtask_result.json") 516 | -------------------------------------------------------------------------------- /math_calculator_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | from dspy.teleprompt import MIPROv2, BootstrapFewShotWithRandomSearch, BootstrapFewShot 6 | from math_calculator import MathCalculator, MathCalculationSignature 7 | import tqdm 8 | import logging 9 | 10 | logging.basicConfig(filename='optimization_log.txt', level=logging.INFO) 11 | 12 | # Set global tqdm configuration 13 | tqdm.tqdm.pandas() 14 | tqdm.tqdm.get_lock().locks = [] 15 | tqdm.tqdm.ncols = 60 16 | 17 | class MathOptimizer: 18 | def __init__(self): 19 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 20 | 
dspy.settings.configure(lm=self.lm) 21 | self.calculator = MathCalculator() 22 | self.student = None 23 | self.teacher = None 24 | 25 | def set_student(self, student): 26 | """Set the student model for optimization""" 27 | self.student = student 28 | 29 | def set_teacher(self, teacher): 30 | """Set the teacher model for optimization""" 31 | self.teacher = teacher 32 | 33 | def load_dataset(self, dataset_path="math_dataset.json"): 34 | with open(dataset_path) as f: 35 | dataset = json.load(f) 36 | # First 100 samples for validation, rest for training 37 | return dataset[100:], dataset[:100] 38 | 39 | def create_trainset(self, dataset): 40 | trainset = [] 41 | # for item in tqdm.tqdm(dataset[:100], ncols=60): 42 | for item in dataset: 43 | trainset.append(dspy.Example( 44 | task=item['task'], 45 | solution=item['solution'] 46 | ).with_inputs('task')) 47 | return trainset 48 | 49 | def optimize(self, trainset, num_candidates=10, base_model=None): 50 | # Define the metric function with subtask reasoning evaluation 51 | def metric(example, prediction, trace=None): 52 | try: 53 | # Handle both string and numeric solutions 54 | pred_solution = float(prediction.solution) if isinstance(prediction.solution, str) else prediction.solution 55 | exp_solution = float(example.solution) if isinstance(example.solution, str) else example.solution 56 | 57 | # Compare with tolerance for floating point numbers 58 | accuracy = int(abs(pred_solution - exp_solution) < 0.01) 59 | 60 | # Evaluate subtask reasoning quality 61 | if hasattr(prediction, 'reasoning'): 62 | reasoning = prediction.reasoning.lower() 63 | # Check for subtask indicators 64 | if 'subtask' in reasoning or 'step' in reasoning or 'part' in reasoning: 65 | # Additional points for using subtask reasoning 66 | accuracy += 1 67 | # Check for proper combination of subtasks 68 | if 'combine' in reasoning or 'final result' in reasoning: 69 | accuracy += 1 70 | 71 | return min(accuracy, 1) # Cap at 1 to maintain binary metric 72 | except (ValueError, TypeError, AttributeError) as e: 73 | print(f"Metric error: {e}") 74 | return 0 75 | 76 | # Configure MIPRO optimizer with subtask reasoning focus 77 | teleprompter = MIPROv2( 78 | metric=metric, 79 | num_candidates=num_candidates, 80 | init_temperature=1.0, 81 | prompt_model=self.lm, 82 | task_model=self.lm, 83 | num_threads=100, 84 | auto='light', 85 | track_stats=True 86 | ) 87 | 88 | # Set student and teacher if not already set 89 | if self.student is None: 90 | self.set_student(base_model) 91 | if self.teacher is None: 92 | self.set_teacher(base_model) 93 | 94 | # Run optimization with subtask reasoning focus 95 | optimized_calculator = teleprompter.compile( 96 | student=self.student, 97 | teacher=self.teacher, 98 | trainset=trainset, 99 | num_trials=7, 100 | max_bootstrapped_demos=3, 101 | max_labeled_demos=4, 102 | requires_permission_to_run=False, 103 | minibatch=True, 104 | ) 105 | 106 | 107 | return optimized_calculator 108 | 109 | def save_optimized_model(self, optimized_calculator, path="optimized_models/optimized_math_calculator.json"): 110 | optimized_calculator.save(path) 111 | print(f"Optimized model saved to {path}") 112 | 113 | from concurrent.futures import ThreadPoolExecutor, as_completed 114 | 115 | def evaluate_single_task(calculator, item): 116 | try: 117 | result = calculator.forward(item['task']) 118 | # Handle both string and numeric solutions 119 | pred_solution = float(result.solution) if isinstance(result.solution, str) else result.solution 120 | exp_solution = 
float(item['solution']) if isinstance(item['solution'], str) else item['solution'] 121 | 122 | # Compare with tolerance for floating point numbers 123 | return int(abs(pred_solution - exp_solution) < 0.01) 124 | except (ValueError, TypeError, AttributeError) as e: 125 | print(f"Evaluation error for task {item['task']}: {e}") 126 | return 0 127 | 128 | def evaluate_model(calculator, dataset, num_threads=10): 129 | correct = 0 130 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 131 | futures = [ 132 | executor.submit(evaluate_single_task, calculator, item) 133 | for item in dataset[:100] 134 | ] 135 | for future in tqdm.tqdm(as_completed(futures), total=len(futures), 136 | ncols=60): 137 | correct += future.result() 138 | return correct / 100 # Return accuracy 139 | 140 | if __name__ == "__main__": 141 | optimizer = MathOptimizer() 142 | 143 | # Load and split dataset 144 | train_data, val_data = optimizer.load_dataset() 145 | trainset = optimizer.create_trainset(train_data) 146 | 147 | # Initialize results tracking 148 | results = [] 149 | current_calculator = optimizer.calculator 150 | 151 | # Evaluate initial model on validation set 152 | print("Evaluating initial model...") 153 | initial_accuracy = evaluate_model(current_calculator, val_data, num_threads=20) 154 | results.append(("Initial", initial_accuracy)) 155 | print(f"Initial accuracy: {initial_accuracy:.1%}") 156 | 157 | current_calculator_student = current_calculator.deepcopy() 158 | 159 | # Run multiple optimization iterations with memory cleanup 160 | num_iterations = 1 161 | for i in range(num_iterations): 162 | print(f"\nStarting optimization iteration {i+1}/{num_iterations}...") 163 | current_calculator = current_calculator.deepcopy() 164 | current_calculator = current_calculator.reset_copy() 165 | 166 | # Set student and teacher for this iteration 167 | optimizer.set_student(current_calculator_student) 168 | optimizer.set_teacher(current_calculator) 169 | 170 | # Run optimization on current calculator 171 | optimized_calculator = optimizer.optimize(trainset, num_candidates=3, base_model=current_calculator) 172 | 173 | # Evaluate optimized model on validation set 174 | accuracy = evaluate_model(optimized_calculator, val_data, num_threads=20) 175 | 176 | # Explicit memory cleanup 177 | del current_calculator 178 | import gc 179 | gc.collect() 180 | current_calculator = optimized_calculator 181 | results.append((f"Iteration {i+1}", accuracy)) 182 | print(f"Optimization iteration {i+1} accuracy: {accuracy:.1%}") 183 | 184 | # Save optimized model to optimized_models directory 185 | import os 186 | os.makedirs("optimized_models", exist_ok=True) 187 | model_path = f"optimized_models/optimized_math_calculator_iter{i+1}.json" 188 | optimizer.save_optimized_model(optimized_calculator, model_path) 189 | 190 | # Set as current calculator for next iteration 191 | current_calculator = optimized_calculator 192 | 193 | # Print all results 194 | print("\nFinal Results:") 195 | for stage, accuracy in results: 196 | print(f"{stage}: {accuracy:.1%}") 197 | 198 | print("Optimization complete!") 199 | -------------------------------------------------------------------------------- /math_dataset_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import math 5 | import random 6 | import json 7 | from tqdm import tqdm 8 | 9 | class MathDatasetGenerator: 10 | def __init__(self): 11 | # Configurable parameters for difficulty 12 | 
self.basic_operators = ['+', '-', '*', '/'] 13 | self.advanced_operators = ['^', '√', '%'] # Exponentiation, square root, modulo 14 | self.use_advanced_ops = False # Toggle advanced operators 15 | self.parentheses_prob = 0.3 # Probability of adding parentheses 16 | self.min_num = -10000 # Minimum number value 17 | self.max_num = 10000 # Maximum number value 18 | self.min_ops = 5 # Minimum operations per expression 19 | self.max_ops = 15 # Maximum operations per expression 20 | self.allow_decimals = False # Allow decimal numbers 21 | self.allow_negatives = False # Allow negative numbers 22 | self.allow_variables = False # Include variables in expressions 23 | self.variables = ['x', 'y', 'z'] # Available variables 24 | 25 | def _generate_number(self): 26 | if self.allow_decimals: 27 | return round(random.uniform(self.min_num, self.max_num), 2) 28 | return random.randint(self.min_num, self.max_num) 29 | 30 | def _generate_expression(self): 31 | # Generate number of operations 32 | num_ops = random.randint(self.min_ops, self.max_ops) 33 | 34 | # Choose starting element (number or variable) 35 | if self.allow_variables and random.random() < 0.3: # 30% chance to start with variable 36 | expression = random.choice(self.variables) 37 | else: 38 | expression = str(self._generate_number()) 39 | 40 | for _ in range(num_ops): 41 | # Choose operator 42 | if self.use_advanced_ops and random.random() < 0.5: # 50% chance for advanced op 43 | op = random.choice(self.advanced_operators) 44 | else: 45 | op = random.choice(self.basic_operators) 46 | 47 | # Choose next element (number or variable) 48 | if self.allow_variables and random.random() < 0.3: # 30% chance for variable 49 | next_element = random.choice(self.variables) 50 | else: 51 | next_element = str(self._generate_number()) 52 | 53 | # Handle special operators 54 | if op == '√': # Square root 55 | expression = f"{op}({expression})" 56 | elif op == '^': # Exponentiation 57 | expression = f"({expression}){op}{next_element}" 58 | else: 59 | # Decide whether to add parentheses 60 | if random.random() < self.parentheses_prob: 61 | expression = f"({expression} {op} {next_element})" 62 | else: 63 | expression = f"{expression} {op} {next_element}" 64 | 65 | return expression 66 | 67 | def generate_dataset(self, num_tasks=100): 68 | dataset = [] 69 | 70 | for _ in tqdm(range(num_tasks), desc="Generating Math Tasks", ncols=60): 71 | expression = self._generate_expression() 72 | 73 | # Calculate solution using eval (safe since we control the input) 74 | try: 75 | # Use a safer expression evaluator 76 | from ast import literal_eval 77 | try: 78 | # First try evaluating as-is 79 | solution = literal_eval(expression) 80 | except (ValueError, SyntaxError): 81 | # If that fails, try evaluating as a math expression 82 | import operator 83 | import math 84 | allowed_operators = { 85 | '+': operator.add, 86 | '-': operator.sub, 87 | '*': operator.mul, 88 | '/': operator.truediv, 89 | '^': operator.pow, 90 | '%': operator.mod, 91 | '√': math.sqrt 92 | } 93 | # Parse and evaluate the expression safely 94 | stack = [] 95 | for token in expression.split(): 96 | if token in allowed_operators: 97 | if token == '√': 98 | operand = stack.pop() 99 | stack.append(allowed_operators[token](operand)) 100 | else: 101 | right = stack.pop() 102 | left = stack.pop() 103 | stack.append(allowed_operators[token](left, right)) 104 | else: 105 | try: 106 | stack.append(float(token)) 107 | except ValueError: 108 | stack.append(0) # Default to 0 for invalid tokens 109 | solution = stack[0] 
if stack else 0 110 | # Round to 2 decimal places for division results 111 | if isinstance(solution, float): 112 | solution = round(solution, 2) 113 | 114 | dataset.append({ 115 | 'task': expression, 116 | 'solution': solution 117 | }) 118 | except ZeroDivisionError: 119 | continue 120 | 121 | 122 | return dataset 123 | 124 | if __name__ == "__main__": 125 | # Generate dataset 126 | generator = MathDatasetGenerator() 127 | dataset = generator.generate_dataset(num_tasks=10000) 128 | 129 | # Save to file 130 | with open("math_dataset.json", "w") as f: 131 | json.dump(dataset, f, indent=2) 132 | 133 | print(f"Generated {len(dataset)} math tasks. Saved to math_dataset.json") 134 | -------------------------------------------------------------------------------- /math_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import tqdm 4 | from collections import Counter 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | class MathEvaluator: 8 | def __init__(self, calculator, num_threads=10, max_samples=100): 9 | self.calculator = calculator 10 | self.num_threads = num_threads 11 | self.max_samples = max_samples 12 | 13 | def evaluate_single_task(self, item): 14 | task = item['task'] 15 | expected_solution = item['solution'] 16 | 17 | iter_start = time.time() 18 | result = self.calculator._forward_with_max_iter(task, max_iter=self.calculator.max_iterations) 19 | elapsed = time.time() - iter_start 20 | 21 | correct = self.calculator._is_correct(result.solution, expected_solution) 22 | return correct, elapsed 23 | 24 | def evaluate_on_dataset(self, dataset_path="math_dataset.json"): 25 | start_time = time.time() 26 | 27 | with open(dataset_path) as f: 28 | dataset = json.load(f) 29 | 30 | dataset = dataset[:self.max_samples] if hasattr(self, 'max_samples') else dataset[:100] 31 | 32 | results = { 33 | "correct": 0, 34 | "time": 0 35 | } 36 | 37 | with ThreadPoolExecutor(max_workers=self.num_threads) as executor: 38 | futures = [ 39 | executor.submit(self.evaluate_single_task, item) 40 | for item in dataset 41 | ] 42 | 43 | for i, future in enumerate(tqdm.tqdm(as_completed(futures), total=len(futures), ncols=60), 1): 44 | correct, elapsed = future.result() 45 | results["correct"] += int(correct) 46 | results["time"] += elapsed 47 | 48 | if i % 100 == 0: 49 | print(f"\nProgress after {i} samples:") 50 | print(f"Correct: {results['correct']}/{i} ({results['correct']/i:.1%})") 51 | print(f"Time: {results['time']:.2f}s") 52 | 53 | total_time = time.time() - start_time 54 | results["accuracy"] = results["correct"] / len(dataset) 55 | results["total_time"] = total_time 56 | results["max_iter"] = self.calculator.max_iterations 57 | 58 | print("\nEvaluation Results:") 59 | print(f"Max Iterations: {self.calculator.max_iterations}") 60 | print(f"Correct Answers: {results['correct']}/{len(dataset)} ({results['accuracy']:.1%})") 61 | print(f"Total Time: {results['total_time']:.2f}s") 62 | 63 | with open("math_calculator_benchmark.json", "w") as f: 64 | json.dump(results, f, indent=2) 65 | 66 | print("\nBenchmark results saved to math_calculator_benchmark.json") 67 | return results 68 | -------------------------------------------------------------------------------- /math_multiplication_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import random 5 | import numpy as np 6 | from typing import List 7 | from 
dspy.teleprompt import MIPROv2
8 | from tqdm import tqdm
9 | 
10 | class MultiplicationSignature(dspy.Signature):
11 |     """Solve multiplication problems step by step."""
12 |     task = dspy.InputField(desc="multiplication task as a string")
13 |     solution = dspy.OutputField(desc="final solution as a number")
14 | 
15 | class MultiplicationSolver(dspy.Module):
16 |     def __init__(self):
17 |         super().__init__()
18 |         self.generate_answer = dspy.ChainOfThought(MultiplicationSignature)
19 | 
20 |     def forward(self, task):
21 |         return self.generate_answer(task=task)
22 | 
23 | def generate_multiplication_dataset(num_samples=1000) -> List[dspy.Example]:
24 |     """Generate multiplication problems with solutions."""
25 |     dataset = []
26 |     for _ in range(num_samples):
27 |         # a = random.randint(1, 10000)
28 |         # b = random.randint(1, 10000)
29 |         max_num = int(1e5)
30 |         a = random.randint(1, max_num)
31 |         b = random.randint(1, max_num)
32 |         task = f"{a} * {b}"
33 |         solution = a * b
34 |         dataset.append(dspy.Example(task=task, solution=solution).with_inputs('task'))
35 |     return dataset
36 | 
37 | def evaluate_multiplication(example, prediction, trace=None):
38 |     """Evaluate if predicted solution matches expected."""
39 |     try:
40 |         pred = float(prediction.solution)
41 |         exp = float(example.solution)
42 |         return int(abs(pred - exp) < 0.01)
43 |     except:
44 |         return 0
45 | 
46 | def optimize_multiplication_solver():
47 |     # Configure language model
48 |     lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False)
49 |     dspy.settings.configure(lm=lm)
50 | 
51 |     # Generate dataset
52 |     dataset = generate_multiplication_dataset(1000)
53 |     trainset = dataset[:800] # 80% training
54 |     devset = dataset[800:] # 20% validation
55 | 
56 |     # Initialize MIPROv2 optimizer
57 |     teleprompter = MIPROv2(
58 |         metric=evaluate_multiplication,
59 |         num_candidates=3,
60 |         num_threads=10,
61 |         max_bootstrapped_demos=3,
62 |         max_labeled_demos=4,
63 |         # auto='light'
64 |         auto='medium'
65 |     )
66 | 
67 |     # Create and optimize solver
68 |     student = MultiplicationSolver()
69 |     optimized_solver = teleprompter.compile(
70 |         student,
71 |         trainset=trainset,
72 |         valset=devset,
73 |         requires_permission_to_run=False
74 |     )
75 | 
76 |     # Evaluate on validation set
77 |     correct = 0
78 |     for example in devset:
79 |         prediction = optimized_solver(example.task)
80 |         correct += evaluate_multiplication(example, prediction)
81 | 
82 |     accuracy = correct / len(devset)
83 |     print(f"Validation accuracy: {accuracy:.1%}")
84 |     correct = 0  # vs unoptimized solver: reset the counter before re-scoring
85 |     student = MultiplicationSolver()
86 |     for example in devset:
87 |         prediction = student(example.task)
88 |         correct += evaluate_multiplication(example, prediction)
89 | 
90 |     accuracy = correct / len(devset)
91 |     print(f"Unoptimized accuracy: {accuracy:.1%}")
92 | 
93 |     return optimized_solver
94 | 
95 | from dspy.evaluate import Evaluate
96 | 
97 | 
98 | class LLMProgram(dspy.Module):
99 |     def __init__(self):
100 |         super().__init__()
101 |         self.solver = dspy.ChainOfThought('task -> solution')
102 | 
103 |     def forward(self, task):
104 |         return self.solver(task=task)
105 | 
106 | def quick_optimize():
107 |     dspy.settings.configure(lm=dspy.LM(model="deepseek/deepseek-chat"))
108 |     dataset = [dspy.Example(task=f"{a}*{b}", solution=a*b).with_inputs('task')
109 |                for a,b in zip(np.random.randint(1e5,1e6,1000),
110 |                               np.random.randint(1e5,1e6,1000))]
111 |     train, val = dataset[:800], dataset[800:] # 80/20 split
112 |     # metric = lambda e,p,trace=None: int(abs(float(p.solution)-float(e.solution))<0.01)
113 |     metric = lambda
e,p,trace=None: int(abs(float(p.solution.replace(',',''))-float(e.solution))<0.01) 114 | 115 | llm_program = LLMProgram() 116 | compiled_llm_program = MIPROv2(metric=metric, num_threads=100, auto='heavy').compile( 117 | llm_program, trainset=train, valset=val) 118 | 119 | accuracy = sum(metric(e, compiled_llm_program(e.task)) for e in val) / len(val) 120 | print(f"Optimized accuracy: {accuracy:.1%}") 121 | 122 | 123 | 124 | # Evaluate unoptimized solver 125 | student = MultiplicationSolver() 126 | correct = 0 127 | for example in tqdm(val): 128 | try: 129 | prediction = student(example.task) 130 | correct += metric(example, prediction) 131 | except ValueError as e: print(e); pass 132 | 133 | accuracy = correct / len(val) 134 | print(f"Unoptimized accuracy: {accuracy:.1%}") 135 | evaluate = Evaluate(devset=train[:], metric=metric, num_threads=100, display_progress=True, 136 | display_table=True) 137 | evaluate(student, devset=train[:]) 138 | 139 | 140 | if __name__ == "__main__": 141 | # solver = optimize_multiplication_solver() 142 | quick_optimize() 143 | -------------------------------------------------------------------------------- /reasoning_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | 4 | # Step 1: Configure the LM to use DeepSeek with temperature=1 and no caching 5 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1, cache=False) # Use DeepSeek as the LM 6 | dspy.settings.configure(lm=lm) 7 | 8 | 9 | action_list = ['reasoning', 'terminate'] 10 | # Step 2: Define the Signature for Core Reasoning 11 | class ReasoningSignature(dspy.Signature): 12 | context = dspy.InputField(desc="The context to reason about") 13 | objective = dspy.InputField(desc="The objective to achieve") 14 | reasoning = dspy.OutputField(desc="The reasoning process including step-by-step calculations") 15 | reasoning_output = dspy.OutputField( 16 | desc="The final output of the reasoning process. If no specific output, repeat the reasoning conclusion.", 17 | optional=True 18 | ) 19 | informal_proof = dspy.OutputField( 20 | desc="A numbered list of steps for the informal proof. If no proof needed, summarize the reasoning steps.", 21 | optional=True 22 | ) 23 | 24 | # Define Signature for Analysis 25 | class RequirementsSignature(dspy.Signature): 26 | context = dspy.InputField(desc="The context of the reasoning") 27 | objective = dspy.InputField(desc="The objective to achieve") 28 | current_requirements = dspy.InputField(desc="List of current requirements to achieve the objective") 29 | new_requirements = dspy.OutputField( 30 | desc="List of new requirements to add to achieve the objective. Return an empty list if no new requirements are needed.", 31 | default=[] 32 | ) 33 | unnecessary_requirements = dspy.OutputField( 34 | desc="List of requirements that are no longer needed to achieve the objective. 
Return an empty list if no requirements should be removed.", 35 | default=[] 36 | ) 37 | action = dspy.OutputField( 38 | desc="The action to take: 'add_requirements' if new requirements are needed, 'remove_requirements' if requirements should be removed, or 'stop' if requirements are complete", 39 | default="stop" 40 | ) 41 | 42 | class ReasoningAnalysisSignature(dspy.Signature): 43 | context = dspy.InputField(desc="The context of the reasoning") 44 | reasoning = dspy.InputField(desc="The reasoning process to analyze") 45 | reasoning_output = dspy.InputField(desc="The output of the reasoning process") 46 | informal_proof = dspy.InputField(desc="The numbered list of proof steps to analyze") 47 | 48 | proof_line_analysis = dspy.OutputField( 49 | desc="Detailed analysis of each proof line, checking if it makes logical sense and is mathematically correct" 50 | ) 51 | 52 | objective_achieved_analysis = dspy.OutputField( 53 | desc="Analysis of whether the objective was fully achieved" 54 | ) 55 | 56 | objective_achieved_confidence = dspy.OutputField( 57 | desc="Confidence score from 1-10 where 1 means extremely sure objective was not achieved and 10 means objective was definitely achieved" 58 | ) 59 | 60 | is_valid_reasoning = dspy.OutputField( 61 | desc="True if the reasoning in the input is valid and reaches the correct conclusion" 62 | ) 63 | 64 | action = dspy.OutputField( 65 | desc="The action to take, must be either 'reasoning' or 'terminate'" 66 | ) 67 | 68 | 69 | # Step 3: Create a Module with the Signature 70 | class RequirementsGenerator(dspy.Module): 71 | def __init__(self): 72 | super().__init__() 73 | self.generate_requirements = dspy.ChainOfThought(RequirementsSignature) 74 | 75 | def forward(self, context, objective, current_requirements): 76 | result = self.generate_requirements( 77 | context=context, 78 | objective=objective, 79 | current_requirements=current_requirements 80 | ) 81 | return result 82 | 83 | class ActionReasoning(dspy.Module): 84 | def __init__(self): 85 | super().__init__() 86 | # Use ChainOfThought for core reasoning 87 | self.generate_action = dspy.ChainOfThought(ReasoningSignature) 88 | # Separate module for analysis 89 | self.analyze_reasoning = dspy.ChainOfThought(ReasoningAnalysisSignature) 90 | # Module for requirements generation 91 | self.requirements_generator = RequirementsGenerator() 92 | 93 | def forward(self, context, objective): 94 | # First generate the reasoning 95 | reasoning_result = self.generate_action(context=context, objective=objective) 96 | 97 | # Handle missing fields 98 | reasoning = getattr(reasoning_result, "reasoning", "No reasoning provided") 99 | reasoning_output = getattr(reasoning_result, "reasoning_output", reasoning) 100 | informal_proof = getattr(reasoning_result, "informal_proof", reasoning) 101 | 102 | # Then analyze the reasoning and proof 103 | analysis_result = self.analyze_reasoning( 104 | context=context, 105 | reasoning=reasoning, 106 | reasoning_output=reasoning_output, 107 | informal_proof=informal_proof 108 | ) 109 | 110 | # Handle missing analysis fields 111 | objective_achieved_analysis = getattr(analysis_result, "objective_achieved_analysis", "No analysis provided") 112 | objective_achieved_confidence = getattr(analysis_result, "objective_achieved_confidence", 5) 113 | is_valid_reasoning = getattr(analysis_result, "is_valid_reasoning", "unknown") 114 | action = getattr(analysis_result, "action", "reasoning") 115 | proof_line_analysis = getattr(analysis_result, "proof_line_analysis", "No proof line analysis 
provided") 116 | 117 | combined = { 118 | "reasoning": reasoning, 119 | "reasoning_output": reasoning_output, 120 | "informal_proof": informal_proof, 121 | "objective_achieved_analysis": objective_achieved_analysis, 122 | "objective_achieved_confidence": objective_achieved_confidence, 123 | "is_valid_reasoning": is_valid_reasoning, 124 | "action": action, 125 | "proof_line_analysis": proof_line_analysis 126 | } 127 | return dspy.Prediction(**combined) 128 | 129 | # Step 4: Create an Instance of the Pipeline 130 | reasoning_pipeline = ActionReasoning() 131 | 132 | def generate_requirements(context, objective): 133 | """Iteratively generate and refine requirements for achieving an objective""" 134 | requirements = [] 135 | iteration = 1 136 | max_iterations = 10 137 | 138 | while iteration <= max_iterations: 139 | # If we hit max iterations, reset completely and try again 140 | if iteration == max_iterations: 141 | print("\nWarning: Reached max iterations. Resetting requirements and starting fresh.") 142 | requirements = [] 143 | iteration = 1 144 | continue 145 | 146 | print(f"\n--- Requirements Iteration {iteration} ---") 147 | print("Current Requirements:") 148 | for i, req in enumerate(requirements, 1): 149 | print(f"{i}. {req}") 150 | 151 | # Generate new requirements 152 | result = RequirementsGenerator()( 153 | context=context, 154 | objective=objective, 155 | current_requirements=requirements 156 | ) 157 | 158 | # Process new requirements 159 | if result.new_requirements: 160 | if isinstance(result.new_requirements, str): 161 | # Split string into list items and filter out non-requirement statements 162 | new_reqs = [ 163 | req.strip() for req in result.new_requirements.split('\n') 164 | if req.strip() and not req.lower().startswith(('none', 'no new')) 165 | ] 166 | else: 167 | # Filter list items for non-requirement statements 168 | new_reqs = [ 169 | req for req in result.new_requirements 170 | if not str(req).lower().startswith(('none', 'no new')) 171 | ] 172 | 173 | if new_reqs: # Only add if we have actual requirements 174 | print("\nAdding new requirements:") 175 | for req in new_reqs: 176 | print(f"- {req}") 177 | requirements.append(req) 178 | 179 | # Process unnecessary requirements 180 | if result.unnecessary_requirements: 181 | if isinstance(result.unnecessary_requirements, str): 182 | # Split string into list items 183 | remove_reqs = [req.strip() for req in result.unnecessary_requirements.split('\n') if req.strip()] 184 | else: 185 | remove_reqs = result.unnecessary_requirements 186 | 187 | print("\nRemoving unnecessary requirements:") 188 | for req in remove_reqs: 189 | print(f"- {req}") 190 | requirements = [r for r in requirements if r not in remove_reqs] 191 | 192 | # Check if we should stop 193 | if result.action.lower().strip() == "stop": 194 | print("\nRequirements generation complete") 195 | break 196 | 197 | iteration += 1 198 | 199 | print("\nFinal Requirements:") 200 | for i, req in enumerate(requirements, 1): 201 | print(f"{i}. 
{req}") 202 | 203 | return requirements 204 | 205 | def track_analysis(analysis_history, analysis, confidence): 206 | """Track analysis results as a list of tuples""" 207 | try: 208 | # Extract first digit if confidence is a string 209 | if isinstance(confidence, str): 210 | confidence = ''.join(filter(str.isdigit, confidence)) or '5' 211 | confidence_int = int(confidence) 212 | # Clamp to 1-10 range 213 | confidence_int = max(1, min(10, confidence_int)) 214 | analysis_history.append((analysis, confidence_int)) 215 | except (ValueError, TypeError): 216 | # Default to medium confidence if parsing fails 217 | analysis_history.append((analysis, 5)) 218 | return analysis_history 219 | 220 | def run_reasoning_pipeline(initial_context, initial_objective, callback=None): 221 | # Generate requirements first with retry logic 222 | max_retries = 3 223 | requirements = [] 224 | 225 | for attempt in range(max_retries): 226 | requirements = generate_requirements(initial_context, initial_objective) 227 | print(f"\nFinal Requirements: {requirements}") 228 | 229 | # If we got requirements, break 230 | if requirements: 231 | break 232 | 233 | print(f"\nWarning: Empty requirements list on attempt {attempt + 1}. Retrying...") 234 | 235 | # If still empty after retries, use a default requirement 236 | if not requirements: 237 | print("\nWarning: Could not generate requirements after multiple attempts. Using default.") 238 | requirements = ["Use the given numbers and operations to achieve the objective"] 239 | 240 | # Initialize context and analysis history 241 | requirements_str = "\n".join(f"- {req}" for req in requirements) 242 | initial_context_with_reqs = f"{initial_context.strip()}\n\nRequirements:\n{requirements_str}" 243 | context_history = [initial_context_with_reqs] 244 | analysis_history = [] 245 | 246 | # Extract question and hint if they exist 247 | context_lines = initial_context.split('\n') 248 | question = next((line for line in context_lines if line.startswith("Final Question:")), initial_context) 249 | hint = next((line for line in context_lines if line.startswith("Hint:")), "") 250 | 251 | # Create display context for debugging 252 | display_context = f"{question}\n{hint}\n\nRequirements:\n{requirements_str}" if hint else f"{question}\n\nRequirements:\n{requirements_str}" 253 | objective = initial_objective 254 | iteration = 1 255 | 256 | while True: 257 | print(f"\n--- Reasoning Iteration {iteration} ---") 258 | print(f"Context: {display_context}") 259 | print(f"Objective: {objective}") 260 | 261 | # Get current context from history 262 | current_context = "\n\n".join(context_history) 263 | 264 | # Run the reasoning pipeline 265 | result = reasoning_pipeline(context=current_context, objective=objective) 266 | 267 | # Track analysis and call callback if provided 268 | analysis_history = track_analysis(analysis_history, 269 | result.objective_achieved_analysis, 270 | result.objective_achieved_confidence) 271 | 272 | if callback: 273 | callback(iteration, current_context, objective, result) 274 | 275 | # Validate and process the action 276 | action = result.action.lower().strip() 277 | print("Reasoning Process:", result.reasoning) 278 | print("Reasoning Output:", result.reasoning_output) 279 | print("\nDetailed Informal Proof Steps:") 280 | if isinstance(result.informal_proof, str): 281 | # Convert string proof to list if needed 282 | proof_steps = [step.strip() for step in result.informal_proof.split('\n') if step.strip()] 283 | else: 284 | proof_steps = result.informal_proof 285 | 286 | for 
i, step in enumerate(proof_steps, 1):
287 |             print(f"{i}. {step}")
288 | 
289 |         print("\nProof Line Analysis:")
290 |         print(result.proof_line_analysis)
291 |         print("\nObjective Achievement Analysis:")
292 |         print(f"{result.objective_achieved_analysis} (Confidence: {result.objective_achieved_confidence}/10)")
293 |         print("\nAnalysis History:")
294 |         for i, (analysis, confidence) in enumerate(analysis_history, 1):
295 |             print(f"Iteration {i}: {analysis} (Confidence: {confidence}/10)")
296 | 
297 |         print("action:", action)
298 | 
299 |         # Only accept termination if explicitly told to
300 |         if "terminate" in action or "no further" in action:
301 |             if result.is_valid_reasoning.lower().strip() in ["true", "yes", "correct"]:
302 |                 print("Decision: Terminate reasoning process with valid solution")
303 |                 break
304 |             else:
305 |                 print("Decision: Invalid solution found - continuing reasoning")
306 |                 objective = "The previous solution was mathematically incorrect. Try a different approach."
307 |                 continue
308 | 
309 |         print("Decision: Continue reasoning")
310 | 
311 |         # Update context history with full reasoning details
312 |         context_history.append(f"""
313 | --- Reasoning Iteration {iteration} ---
314 | Context: {display_context}
315 | Objective: {objective}
316 | Reasoning Process: {result.reasoning}
317 | Reasoning Output: {result.reasoning_output}
318 | Objective Analysis: {result.objective_achieved_analysis} (Confidence: {result.objective_achieved_confidence}/10)
319 | """.strip())
320 | 
321 |         # Update context for next iteration with full history
322 |         context = "\n\n".join(context_history)
323 |         objective = "Continue reasoning based on previous analysis"
324 |         iteration += 1
325 | 
326 | # Example usage
327 | initial_context = """
328 | How can you solve the Game of 24 using the numbers 3,
329 | 4, 5, and 6?
330 | Let's think step by step:
331 | 1. We need to use basic arithmetic operations (+, -, *, /) to
332 | get 24.
333 | 2. One possible solution is: (3 + 5 - 4) * 6 = 24."""
334 | initial_objective = "Generate a new solution using the same numbers."
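# A minimal sketch of wiring in the optional callback hook of
# run_reasoning_pipeline; the callback name and body below are illustrative
# and not part of the module:
#
#     def log_step(iteration, context, objective, result):
#         print(f"[iteration {iteration}] action={result.action} "
#               f"(confidence {result.objective_achieved_confidence}/10)")
#
#     run_reasoning_pipeline(initial_context, initial_objective, callback=log_step)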
335 | 336 | if __name__ == "__main__": 337 | run_reasoning_pipeline(initial_context, initial_objective) 338 | -------------------------------------------------------------------------------- /researcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | from typing import List, Dict, Optional 4 | from serper_search import SerperSearch 5 | 6 | class DecideNextActionSignature(dspy.Signature): 7 | """Decide the next action to take based on current information""" 8 | search_results = dspy.InputField(desc="All search results from previous searches") 9 | current_text = dspy.InputField(desc="The current text being worked on") 10 | downloaded_sites = dspy.InputField(desc="All previously downloaded website contents") 11 | reasoning = dspy.OutputField(desc="Reasoning for the chosen action") 12 | action = dspy.OutputField(desc="Next action to take: 'search', 'rewrite', or 'download'") 13 | action_reasoning = dspy.OutputField(desc="Reasoning for the chosen action") 14 | 15 | class RewriteTextSignature(dspy.Signature): 16 | """Rewrite the current text using all available information""" 17 | all_texts = dspy.InputField(desc="All texts including search results and downloaded content") 18 | current_text = dspy.InputField(desc="The current text being rewritten") 19 | reasoning = dspy.OutputField(desc="Reasoning for the rewrite") 20 | rewritten_text = dspy.OutputField(desc="The new rewritten text") 21 | rewrite_reasoning = dspy.OutputField(desc="Explanation of changes made") 22 | 23 | class EvaluateTextSignature(dspy.Signature): 24 | """Evaluate the quality of the rewritten text""" 25 | original_text = dspy.InputField(desc="The original text before rewriting") 26 | rewritten_text = dspy.InputField(desc="The rewritten text to evaluate") 27 | evaluation_reasoning = dspy.OutputField(desc="Detailed reasoning for the evaluation score") 28 | evaluation = dspy.OutputField(desc="Evaluation of text quality on a scale from 1-10") 29 | improvement_suggestions = dspy.OutputField(desc="Suggestions for further improving the text") 30 | 31 | class GenerateSearchQuerySignature(dspy.Signature): 32 | """Generate an effective search query based on research needs""" 33 | current_text = dspy.InputField(desc="The current text being researched") 34 | research_goal = dspy.InputField(desc="The overall goal of the research") 35 | search_results = dspy.InputField(desc="Previous search results", default="") 36 | reasoning = dspy.OutputField(desc="Reasoning for the search query") 37 | search_query = dspy.OutputField(desc="The search query to use") 38 | query_type = dspy.OutputField( 39 | desc="Type of query: 'general' for broad searches, 'specific' for focused searches", 40 | default="general" 41 | ) 42 | 43 | class Researcher(dspy.Module): 44 | def __init__(self, max_iterations: int = 10, max_searches: int = 3): 45 | super().__init__() 46 | self.forward = self.run_research # Map forward to run_research 47 | 48 | # Configure DeepSeek as the language model with higher temperature for more creativity 49 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 50 | dspy.settings.configure(lm=self.lm) 51 | 52 | # Initialize search client 53 | self.search_client = SerperSearch() 54 | 55 | self.max_iterations = max_iterations 56 | self.max_searches = max_searches 57 | self.search_count = 0 58 | self.research_goal = "" 59 | 60 | # Initialize the DSPy modules 61 | self.decide_action = dspy.ChainOfThought(DecideNextActionSignature) 62 | 
self.rewrite_text = dspy.ChainOfThought(RewriteTextSignature) 63 | self.evaluate_text = dspy.ChainOfThought(EvaluateTextSignature) 64 | self.generate_search_query = dspy.ChainOfThought(GenerateSearchQuerySignature) 65 | 66 | # State tracking 67 | self.search_results = [] 68 | self.downloaded_sites = [] 69 | self.all_texts = [] 70 | self.current_text = "" 71 | self.evaluation_history = [] 72 | 73 | def add_search_results(self, results: List[Dict]): 74 | """Add new search results to the researcher's knowledge""" 75 | self.search_results.extend(results) 76 | self.all_texts.extend([r['snippet'] for r in results]) 77 | 78 | def add_downloaded_site(self, content: str): 79 | """Add downloaded website content to the researcher's knowledge""" 80 | self.downloaded_sites.append(content) 81 | self.all_texts.append(content) 82 | 83 | def decide_next_action(self) -> str: 84 | """Determine the next action to take""" 85 | if self.search_count >= self.max_searches: 86 | return 'rewrite' 87 | 88 | result = self.decide_action( 89 | search_results=self.search_results, 90 | current_text=self.current_text, 91 | downloaded_sites=self.downloaded_sites 92 | ) 93 | return result.action.lower() 94 | 95 | def rewrite_current_text(self) -> str: 96 | """Rewrite the current text using all available information""" 97 | result = self.rewrite_text( 98 | all_texts=self.all_texts, 99 | current_text=self.current_text 100 | ) 101 | return result.rewritten_text 102 | 103 | def evaluate_current_text(self) -> Dict: 104 | """Evaluate the quality of the current text""" 105 | if not self.current_text: 106 | return { 107 | 'evaluation': 0, 108 | 'evaluation_reasoning': 'No text to evaluate', 109 | 'improvement_suggestions': 'Start with initial text' 110 | } 111 | 112 | result = self.evaluate_text( 113 | original_text=self.all_texts[0] if self.all_texts else "", 114 | rewritten_text=self.current_text 115 | ) 116 | try: 117 | # Handle different evaluation score formats 118 | if isinstance(result.evaluation, str): 119 | # Try to extract number from string 120 | import re 121 | numbers = re.findall(r'\d+', result.evaluation) 122 | if numbers: 123 | evaluation_score = float(numbers[0]) 124 | else: 125 | evaluation_score = 1.0 126 | else: 127 | evaluation_score = float(result.evaluation) 128 | 129 | # Clamp score between 1-10 and round to nearest integer 130 | evaluation_score = max(1.0, min(10.0, evaluation_score)) 131 | evaluation_score = round(evaluation_score) 132 | 133 | return { 134 | 'evaluation': evaluation_score, 135 | 'evaluation_reasoning': result.evaluation_reasoning, 136 | 'improvement_suggestions': result.improvement_suggestions 137 | } 138 | except (ValueError, TypeError): 139 | # Default to low score if conversion fails 140 | return { 141 | 'evaluation': 1, 142 | 'evaluation_reasoning': "Invalid evaluation score format", 143 | 'improvement_suggestions': "Ensure evaluation returns a valid number between 1-10" 144 | } 145 | 146 | def generate_search_terms(self) -> str: 147 | """Generate effective search terms based on current research state""" 148 | result = self.generate_search_query( 149 | current_text=self.current_text, 150 | research_goal=self.research_goal, 151 | search_results=self.search_results 152 | ) 153 | return result.search_query 154 | 155 | def run_research(self, initial_text: str) -> Dict: 156 | """Run the research process with iteration control""" 157 | if not initial_text: 158 | raise ValueError("Initial text cannot be empty") 159 | 160 | self.current_text = initial_text 161 | self.all_texts = [initial_text] 
162 | self.research_goal = initial_text # Use initial text as research goal 163 | 164 | for iteration in range(self.max_iterations): 165 | print(f"\n--- Research Iteration {iteration + 1} ---") 166 | 167 | # Decide next action 168 | action = self.decide_next_action() 169 | print(f"Action: {action}") 170 | 171 | if action == 'search': 172 | if self.search_count >= self.max_searches: 173 | print("Max searches reached, switching to rewrite") 174 | action = 'rewrite' 175 | else: 176 | self.search_count += 1 177 | # Generate optimized search terms 178 | search_term = self.generate_search_terms() 179 | print(f"Performing search for: {search_term}...") 180 | 181 | try: 182 | # Perform actual search with error handling 183 | search_results = self.search_client.search(search_term) 184 | if search_results: 185 | self.add_search_results(search_results) 186 | else: 187 | print("Warning: No search results found") 188 | except Exception as e: 189 | print(f"Search error: {str(e)}") 190 | continue 191 | 192 | continue 193 | 194 | elif action == 'download': 195 | # Note: Actual download implementation would go here 196 | print("Downloading site...") 197 | continue 198 | 199 | elif action == 'rewrite': 200 | # Rewrite the text 201 | new_text = self.rewrite_current_text() 202 | print("\nRewritten Text:") 203 | print(new_text) 204 | 205 | # Evaluate the new text 206 | evaluation = self.evaluate_current_text() 207 | print("\nEvaluation:") 208 | print(f"Score: {evaluation['evaluation']}/10") 209 | print(f"Reasoning: {evaluation['evaluation_reasoning']}") 210 | print(f"Suggestions: {evaluation['improvement_suggestions']}") 211 | 212 | # Update state 213 | self.current_text = new_text 214 | self.all_texts.append(new_text) 215 | self.evaluation_history.append(evaluation) 216 | 217 | # Check if we should terminate 218 | if evaluation['evaluation'] >= 9: 219 | print("\nHigh quality text achieved, stopping research") 220 | break 221 | 222 | else: 223 | print(f"Unknown action: {action}, defaulting to rewrite") 224 | action = 'rewrite' 225 | 226 | return { 227 | 'final_text': self.current_text, 228 | 'evaluation_history': self.evaluation_history, 229 | 'search_count': self.search_count, 230 | 'iterations': iteration + 1 231 | } 232 | 233 | if __name__ == "__main__": 234 | # Example usage 235 | initial_text = "Write a comprehensive overview of recent developments in AI research" 236 | 237 | researcher = Researcher(max_iterations=10, max_searches=3) 238 | result = researcher.run_research(initial_text) 239 | 240 | print("\nFinal Result:") 241 | print(result['final_text']) 242 | print("\nEvaluation History:") 243 | for i, eval in enumerate(result['evaluation_history'], 1): 244 | print(f"Iteration {i}: Score {eval['evaluation']}/10") 245 | -------------------------------------------------------------------------------- /researcher_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | from typing import List, Dict 4 | from researcher import Researcher 5 | from dspy.teleprompt import MIPROv2 6 | from dspy import Example 7 | 8 | # Dataset of task prompts for optimization 9 | RESEARCH_TASKS = [ 10 | { 11 | "input": "Write a comprehensive overview of recent developments in AI research", 12 | }, 13 | { 14 | "input": "Explain the latest breakthroughs in quantum computing", 15 | }, 16 | { 17 | "input": "Compare different approaches to climate change mitigation", 18 | }, 19 | { 20 | "input": "Analyze the impact of social media on mental health", 
21 | }, 22 | { 23 | "input": "Describe the evolution of renewable energy technologies", 24 | }, 25 | { 26 | "input": "Evaluate the effectiveness of different education systems worldwide", 27 | }, 28 | { 29 | "input": "Explain the causes and effects of inflation in modern economies", 30 | }, 31 | { 32 | "input": "Discuss the future of space exploration", 33 | }, 34 | { 35 | "input": "Analyze the role of AI in healthcare diagnostics", 36 | }, 37 | { 38 | "input": "Compare traditional and modern architectural styles", 39 | } 40 | ] 41 | 42 | 43 | def create_dataset() -> List[Example]: 44 | """Create dataset from predefined research tasks with validation""" 45 | dataset = [] 46 | for task in RESEARCH_TASKS: 47 | example = Example(input=task["input"]).with_inputs('input') 48 | dataset.append(example) 49 | return dataset 50 | 51 | class ResearcherOptimizer: 52 | def __init__(self, max_iterations: int = 10, max_searches: int = 3): 53 | self.max_iterations = max_iterations 54 | self.max_searches = max_searches 55 | self.dataset = create_dataset() 56 | 57 | # Configure DeepSeek as the language model 58 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.0, cache=False) 59 | dspy.settings.configure(lm=self.lm) 60 | 61 | def evaluate_researcher(self, researcher: Researcher, example: Example) -> float: 62 | """Evaluate researcher performance on a single example""" 63 | result = researcher(example.input) 64 | final_text = result['final_text'] 65 | 66 | # Simple evaluation metric (could be enhanced) 67 | score = self._calculate_similarity(final_text, example.output) 68 | return score 69 | 70 | def _calculate_similarity(self, text1: str, text2: str) -> float: 71 | """Improved text similarity metric using TF-IDF cosine similarity""" 72 | from sklearn.feature_extraction.text import TfidfVectorizer 73 | from sklearn.metrics.pairwise import cosine_similarity 74 | 75 | # Handle empty text cases 76 | if not text1 or not text2: 77 | return 0.0 78 | 79 | # Create TF-IDF vectors 80 | vectorizer = TfidfVectorizer() 81 | tfidf_matrix = vectorizer.fit_transform([text1, text2]) 82 | 83 | # Calculate cosine similarity 84 | similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 85 | return float(similarity) 86 | 87 | def optimize(self, num_candidates: int = 5, num_iterations: int = 3) -> Researcher: 88 | """Optimize the researcher using MIPRO""" 89 | # Define the teleprompter with MIPROv2 90 | teleprompter = MIPROv2( 91 | metric=self.evaluate_researcher, 92 | num_candidates=num_candidates, 93 | num_threads=1, # MIPROv2 uses internal parallelization 94 | teacher_settings=dict(lm=self.lm), 95 | init_temperature=1.0, 96 | prompt_model=self.lm, 97 | task_model=self.lm, 98 | auto='medium', 99 | track_stats=True 100 | ) 101 | 102 | # Create initial researcher 103 | base_researcher = Researcher( 104 | max_iterations=self.max_iterations, 105 | max_searches=self.max_searches 106 | ) 107 | 108 | # Run optimization 109 | optimized_researcher = teleprompter.compile( 110 | base_researcher, 111 | trainset=self.dataset, 112 | num_trials=num_iterations, 113 | requires_permission_to_run=False # Disable confirmation prompt 114 | ) 115 | 116 | return optimized_researcher 117 | 118 | if __name__ == "__main__": 119 | optimizer = ResearcherOptimizer() 120 | 121 | print("Starting researcher optimization...") 122 | optimized_researcher = optimizer.optimize() 123 | 124 | print("\nOptimization complete. 
Testing optimized researcher:") 125 | test_task = RESEARCH_TASKS[0] 126 | result = optimized_researcher.run_research(test_task["input"]) 127 | 128 | print("\nTest Task Input:", test_task["input"]) 129 | print("\nGenerated Output:", result['final_text']) 130 | print("\nEvaluation History:") 131 | for i, eval in enumerate(result['evaluation_history'], 1): 132 | print(f"Iteration {i}: Score {eval['evaluation']}/10") 133 | -------------------------------------------------------------------------------- /residual_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | import time 6 | from typing import List, Optional 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | from tqdm import tqdm 9 | 10 | class SearchReplaceModule(dspy.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.process = dspy.ChainOfThought('input -> search_block, replace_block') 14 | 15 | def forward(self, input_text: str) -> str: 16 | result = self.process(input=input_text) 17 | if not hasattr(result, 'search_block') or not hasattr(result, 'replace_block'): 18 | return input_text 19 | return input_text.replace(result.search_block, result.replace_block) 20 | 21 | class SearchReplacePipeline(dspy.Module): 22 | def __init__(self, num_layers: int = 3): 23 | super().__init__() 24 | self.layers = [SearchReplaceModule() for _ in range(num_layers)] 25 | 26 | def forward(self, task: str) -> str: 27 | current = task 28 | for layer in self.layers: 29 | current = layer(current) 30 | return current 31 | 32 | 33 | 34 | class SearchReplaceIterModule(dspy.Module): 35 | def __init__(self): 36 | super().__init__() 37 | self.process = dspy.ChainOfThought('input, iteration -> search_block, replace_block') 38 | 39 | def forward(self, input_text: str, iteration: int) -> str: 40 | result = self.process(input=input_text, iteration=iteration) 41 | if not hasattr(result, 'search_block') or not hasattr(result, 'replace_block'): 42 | return input_text 43 | return input_text.replace(result.search_block, result.replace_block) 44 | 45 | class SearchReplaceIterPipeline(dspy.Module): 46 | def __init__(self, num_iters: int = 10): 47 | super().__init__() 48 | self.layers = [SearchReplaceIterModule() for _ in range(num_iters)] 49 | 50 | def forward(self, task: str) -> str: 51 | current = task 52 | # for layer in self.layers: 53 | for iteration, layer in enumerate(self.layers): 54 | # current = layer(current) 55 | current = layer(current, iteration) 56 | return current 57 | 58 | 59 | 60 | 61 | 62 | 63 | # try: 64 | # # Try to evaluate the final expression 65 | # return str(eval(current)) 66 | # except: 67 | # return current 68 | 69 | def evaluate_pipeline( 70 | dataset_path: str = "math_dataset.json", 71 | num_threads: int = 10, 72 | num_layers: int = 10, 73 | model: str = "deepseek/deepseek-chat", 74 | temperature: float = 0.3 75 | ) -> float: 76 | print(f"\nEvaluating SearchReplace Pipeline with {num_layers} layers using {model}...") 77 | start_time = time.time() 78 | 79 | with open(dataset_path) as f: 80 | dataset = json.load(f) 81 | 82 | lm = dspy.LM(model=model, temperature=temperature, cache=False) 83 | dspy.settings.configure(lm=lm) 84 | pipeline = SearchReplacePipeline(num_layers=num_layers) 85 | 86 | correct = 0 87 | total_tasks = min(len(dataset), 100) 88 | results = [] 89 | 90 | def evaluate_task(task_data): 91 | try: 92 | task = task_data['task'] 93 | expected = float(task_data['solution']) 94 | 95 | predicted 
= pipeline(task) 96 | predicted_num = float(predicted) 97 | 98 | is_correct = abs(predicted_num - expected) < 0.01 99 | return { 100 | 'task': task, 101 | 'predicted': predicted, 102 | 'expected': expected, 103 | 'correct': is_correct 104 | } 105 | except (ValueError, TypeError) as e: 106 | return { 107 | 'task': task_data['task'], 108 | 'error': str(e), 109 | 'correct': False 110 | } 111 | 112 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 113 | futures = [ 114 | executor.submit(evaluate_task, task_data) 115 | for task_data in dataset[:total_tasks] 116 | ] 117 | 118 | with tqdm(total=total_tasks, desc="Evaluating") as pbar: 119 | for future in as_completed(futures): 120 | result = future.result() 121 | results.append(result) 122 | if result.get('correct', False): 123 | correct += 1 124 | pbar.update(1) 125 | 126 | # Display running accuracy 127 | current_accuracy = correct / len(results) 128 | pbar.set_postfix({'accuracy': f'{current_accuracy:.1%}'}) 129 | 130 | accuracy = correct / total_tasks 131 | elapsed = time.time() - start_time 132 | 133 | print("\nEvaluation Results:") 134 | print(f"Accuracy: {accuracy:.1%}") 135 | print(f"Time taken: {elapsed:.1f}s") 136 | print(f"Tasks evaluated: {total_tasks}") 137 | 138 | # Display some example predictions 139 | print("\nExample predictions:") 140 | for i, result in enumerate(results[:5]): 141 | print(f"\nTask {i+1}:") 142 | print(f"Input: {result['task']}") 143 | if 'error' in result: 144 | print(f"Error: {result['error']}") 145 | else: 146 | print(f"Predicted: {result['predicted']}") 147 | print(f"Expected: {result['expected']}") 148 | print(f"Correct: {result['correct']}") 149 | 150 | return accuracy 151 | 152 | if __name__ == "__main__": 153 | evaluate_pipeline(num_layers=3) 154 | -------------------------------------------------------------------------------- /residual_pipeline_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | import time 6 | import numpy as np 7 | from typing import Dict, List, Tuple 8 | from concurrent.futures import ThreadPoolExecutor 9 | from tqdm import tqdm 10 | from residual_pipeline import SearchReplacePipeline, evaluate_pipeline 11 | from residual_pipeline import SearchReplaceIterPipeline 12 | 13 | PIPELINE_TYPE_STANDARD = "standard" 14 | PIPELINE_TYPE_ITER = "iter" 15 | 16 | class PipelineOptimizer: 17 | def __init__(self, pipeline_type: str = PIPELINE_TYPE_STANDARD): 18 | self.best_config = None 19 | self.best_accuracy = 0.0 20 | self.results_history = [] 21 | self.pipeline_type = pipeline_type 22 | self.dataset_path = "math_dataset.json" 23 | 24 | def _create_teleprompter(self, metric, optimizer_type: str = "bfs"): 25 | """Create and configure teleprompter""" 26 | config = self._get_default_config() 27 | if optimizer_type == "mipro": 28 | return dspy.teleprompt.MIPROv2( 29 | metric=metric, 30 | num_candidates=config['num_candidates'], 31 | num_threads=config['num_threads'], 32 | max_bootstrapped_demos=config['max_bootstrapped_demos'], 33 | max_labeled_demos=config['max_labeled_demos'], 34 | auto='light' 35 | ) 36 | else: # Default to BootstrapFewShot 37 | return dspy.teleprompt.BootstrapFewShot( 38 | metric=metric, 39 | max_bootstrapped_demos=config['max_bootstrapped_demos'], 40 | max_labeled_demos=config['max_labeled_demos'] 41 | ) 42 | 43 | def bootstrap_dataset(self, dataset: List[Dict], num_bootstrap: int = 5) -> List[Dict]: 44 | indices = np.random.choice(len(dataset), 
size=num_bootstrap, replace=True) 45 | return [dataset[i] for i in indices] 46 | 47 | def _create_pipeline(self, config): 48 | """Create appropriate pipeline based on configured type""" 49 | if self.pipeline_type == PIPELINE_TYPE_ITER: 50 | return SearchReplaceIterPipeline(num_iters=config['num_layers']) 51 | return SearchReplacePipeline(num_layers=config['num_layers']) 52 | 53 | def _evaluate_pipeline(self, config, dataset_path, num_threads): 54 | """Evaluate pipeline with given configuration""" 55 | return evaluate_pipeline( 56 | dataset_path=dataset_path, 57 | num_layers=config['num_layers'], 58 | num_threads=num_threads, 59 | model=config['model'], 60 | temperature=config['temperature'] 61 | ) 62 | 63 | def _get_default_config(self) -> Dict: 64 | """Get default configuration for optimization""" 65 | return { 66 | 'num_layers': 10, 67 | 'temperature': 1.0, 68 | 'model': "deepseek/deepseek-chat", 69 | 'num_threads': 10, 70 | 'num_candidates': 3, 71 | 'max_bootstrapped_demos': 3, 72 | 'max_labeled_demos': 4 73 | } 74 | 75 | def _load_dataset(self, dataset_path: str) -> List[Dict]: 76 | """Load dataset from JSON file""" 77 | with open(dataset_path) as f: 78 | return json.load(f) 79 | 80 | def _create_trainset(self, dataset: List[Dict]) -> List[dspy.Example]: 81 | """Create training set from dataset""" 82 | trainset = [] 83 | # for item in dataset[:100]: # Use first 100 examples for training 84 | # random sample import 85 | # for item_i in range(100): 86 | from random import sample 87 | sample_dataset = sample(dataset, 100) 88 | for item in sample_dataset: 89 | trainset.append(dspy.Example( 90 | task=item['task'], 91 | solution=item['solution'] 92 | ).with_inputs('task')) 93 | return trainset 94 | 95 | def _configure_model(self, config: Dict) -> None: 96 | """Configure DSPy language model""" 97 | lm = dspy.LM( 98 | model=config['model'], 99 | temperature=config['temperature'], 100 | cache=False 101 | ) 102 | dspy.settings.configure(lm=lm) 103 | 104 | def _create_fewshot_examples(self, trainset: List[dspy.Example]) -> List[dspy.Example]: 105 | """Create few-shot examples from training set""" 106 | fewshot_examples = [] 107 | for example in trainset[:5]: # Use first 5 examples for few-shot 108 | fewshot_examples.append(dspy.Example( 109 | task=example.task, 110 | solution=example.solution 111 | ).with_inputs('task')) 112 | return fewshot_examples 113 | 114 | def optimize(self) -> Dict: 115 | 116 | print("\nStarting Pipeline Optimization...") 117 | start_time = time.time() 118 | 119 | config = self._get_default_config() 120 | 121 | full_dataset = self._load_dataset(self.dataset_path) 122 | 123 | # Use BFS as default optimizer 124 | optimizer_type = "bfs" 125 | print(f"\nUsing {optimizer_type.upper()} optimizer...") 126 | self._configure_model(config) 127 | 128 | # Define metric function 129 | def metric(example, prediction, trace=None): 130 | try: 131 | pred = float(prediction.solution) 132 | exp = float(example.solution) 133 | return int(abs(pred - exp) < 0.01) 134 | except: 135 | return 0 136 | 137 | teacher = None 138 | best_accuracy = 0.0 139 | best_pipeline = None 140 | 141 | num_iterations = 3 142 | for iteration in range(num_iterations): 143 | print(f"\nBFS Iteration {iteration + 1}/{num_iterations}") 144 | 145 | teleprompter = self._create_teleprompter(metric, optimizer_type) 146 | 147 | # Create new student pipeline 148 | student = self._create_pipeline(config) 149 | 150 | trainset = self._create_trainset(full_dataset) 151 | # Compile with current teacher 152 | optimized_pipeline = 
teleprompter.compile( 153 | student, 154 | trainset=trainset, 155 | teacher=teacher 156 | ) 157 | 158 | # Evaluate the optimized pipeline 159 | accuracy = self._evaluate_pipeline(config, self.dataset_path, config['num_threads']) 160 | 161 | # Update best pipeline if this one is better 162 | if accuracy > best_accuracy: 163 | best_accuracy = accuracy 164 | print(f"New best accuracy: {accuracy:.1%}") 165 | best_pipeline = optimized_pipeline 166 | 167 | # Set current optimized pipeline as teacher for next iteration 168 | teacher = optimized_pipeline 169 | print("Teacher updated") 170 | 171 | 172 | print(f"Iteration {iteration + 1} accuracy: {accuracy:.1%}") 173 | 174 | accuracy = best_accuracy 175 | 176 | result = { 177 | **config, 178 | 'accuracy': accuracy, 179 | 'timestamp': time.time() 180 | } 181 | self.results_history.append(result) 182 | 183 | # Update best accuracy 184 | if accuracy > self.best_accuracy: 185 | self.best_accuracy = accuracy 186 | self.best_config = config 187 | else: 188 | # Just evaluate baseline pipeline 189 | accuracy = evaluate_pipeline( 190 | dataset_path=self.dataset_path, 191 | num_layers=config['num_layers'], 192 | num_threads=config['num_threads'], 193 | model=config['model'], 194 | temperature=config['temperature'] 195 | ) 196 | 197 | result = { 198 | **config, 199 | 'accuracy': accuracy, 200 | 'timestamp': time.time() 201 | } 202 | self.results_history.append(result) 203 | 204 | # Update best accuracy 205 | if accuracy > self.best_accuracy: 206 | self.best_accuracy = accuracy 207 | self.best_config = config 208 | 209 | elapsed = time.time() - start_time 210 | 211 | # Print optimization results 212 | print("\nOptimization Results:") 213 | print(f"Time taken: {elapsed:.1f}s") 214 | print(f"Bootstrap iterations completed: {len(self.results_history)}") 215 | print(f"\nBest Configuration:") 216 | print(f"Number of layers: {self.best_config['num_layers']}") 217 | print(f"Temperature: {self.best_config['temperature']}") 218 | print(f"Accuracy: {self.best_accuracy:.1%}") 219 | 220 | # Print performance progression if we have results 221 | if self.results_history: 222 | print("\nPerformance History:") 223 | for result in sorted(self.results_history, 224 | key=lambda x: x['accuracy'], 225 | reverse=True)[:5]: 226 | print(f"\nLayers: {result['num_layers']}, " 227 | f"Temp: {result['temperature']:.1f}, " 228 | f"Accuracy: {result['accuracy']:.1%}") 229 | 230 | return self.best_config 231 | 232 | import argparse 233 | 234 | def parse_args(): 235 | """Parse command line arguments""" 236 | parser = argparse.ArgumentParser(description='Optimize residual pipeline') 237 | parser.add_argument('--pipeline-type', type=str, default=PIPELINE_TYPE_STANDARD, 238 | choices=[PIPELINE_TYPE_STANDARD, PIPELINE_TYPE_ITER], 239 | help='Type of pipeline to optimize') 240 | parser.add_argument('--optimizer', type=str, default="bfs", 241 | choices=["bfs", "mipro"], 242 | help='Optimizer to use (bfs=BootstrapFewShot, mipro=MIPROv2)') 243 | parser.add_argument('--dataset', type=str, default="math_dataset.json", 244 | help='Path to dataset file') 245 | parser.add_argument('--threads', type=int, default=10, 246 | help='Number of threads to use') 247 | parser.add_argument('--iterations', type=int, default=3, 248 | help='Number of BFS iterations to run') 249 | return parser.parse_args() 250 | 251 | def main(): 252 | args = parse_args() 253 | 254 | print(f"\nOptimizing {args.pipeline_type} pipeline...") 255 | print(f"Optimizer: {args.optimizer.upper()}") 256 | print(f"Dataset: {args.dataset}") 257 | 
print(f"Threads: {args.threads}\n") 258 | 259 | optimizer = PipelineOptimizer(pipeline_type=args.pipeline_type) 260 | baseline_config = optimizer.optimize( 261 | dataset_path=args.dataset, 262 | num_threads=args.threads, 263 | optimizer_type=args.optimizer, 264 | num_iterations=args.iterations 265 | ) 266 | 267 | 268 | if __name__ == "__main__": 269 | main() 270 | -------------------------------------------------------------------------------- /serper_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import requests 5 | from typing import List, Dict, Optional 6 | 7 | from pprint import pprint 8 | 9 | def main(): 10 | if len(sys.argv) < 2: 11 | print("Usage: serper_search.py