├── .gitignore ├── LICENSE ├── README.md ├── benchmark.py ├── deepseek_aider_evaluator.py ├── dynamic_fewshot_routing.py ├── jeopardy_dataset.py ├── math_calculator.py ├── math_calculator_optimizer.py ├── math_dataset_generator.py ├── math_evaluator.py ├── math_multiplication_optimizer.py ├── reasoning_pipeline.py ├── researcher.py ├── researcher_optimizer.py ├── residual_pipeline.py ├── residual_pipeline_optimizer.py ├── serper_search.py ├── signatures.py └── tree_of_thoughts.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | .aider* 173 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Tom Dörr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
4 | Exploring AI reasoning capabilities using DSPy 5 |
26 | This project explores how DSPy can be used to implement and analyze AI reasoning processes. 27 | It's a work in progress for experimenting with different reasoning approaches and patterns. 28 |
29 | 30 | 37 | 38 | ## What it does 39 | 40 | - Implements iterative reasoning processes 41 | - Analyzes reasoning patterns and logical validity 42 | - Tracks reasoning performance metrics 43 | - Provides detailed reasoning analysis 44 | 45 | ## Current Limitations 46 | 47 | This is an experimental project with several known limitations: 48 | 49 | - Reasoning quality depends heavily on the underlying model 50 | - Analysis capabilities are still basic 51 | - Performance metrics are simple 52 | - Needs more diverse test cases 53 | 54 | ## System Components 55 | 56 | ### 1. Math Multiplication Optimizer 57 | - Implements a multiplication solver using DSPy Chain-of-Thought 58 | - Uses MIPROv2 for optimization 59 | - Generates random multiplication problems for training 60 | - Evaluates accuracy on validation set 61 | - Example usage: 62 | ```bash 63 | python3 math_multiplication_optimizer.py 64 | ``` 65 | 66 | ### 2. Residual Pipeline Optimizer 67 | - Optimizes search-replace pipelines using BootstrapFewShot or MIPROv2 68 | - Supports both standard and iterative pipeline types 69 | - Tracks optimization history and best configurations 70 | - Example usage: 71 | ```bash 72 | python3 residual_pipeline_optimizer.py --pipeline-type standard --optimizer mipro 73 | ``` 74 | 75 | ### 3. Jeopardy Dataset Generator 76 | Generates challenging Jeopardy-style questions across multiple categories: 77 | - Creates initial questions and hints 78 | - Produces more challenging final questions 79 | - Saves dataset to `jeopardy_dataset.json` 80 | 81 | ### 4. Reasoning Pipeline 82 | Implements iterative reasoning with: 83 | - Context tracking and history 84 | - Objective achievement analysis 85 | - Formal logical fallacy detection: 86 | - Affirming the consequent 87 | - Denying the antecedent 88 | - Undistributed middle 89 | - Illicit major/minor 90 | - Mathematical validation 91 | - Termination logic 92 | 93 | ### 5. Benchmark System 94 | Measures pipeline performance by: 95 | - Running the reasoning pipeline on generated questions 96 | - Verifying answers using semantic matching 97 | - Tracking metrics: 98 | - Accuracy 99 | - Iterations per question 100 | - Processing time 101 | - Fallacy detection rates 102 | 103 | ## Installation 104 | 105 | 1. Clone the repository: 106 | ```bash 107 | git clone https://github.com/tom-doerr/dspy_reasoning.git 108 | cd dspy_reasoning 109 | ``` 110 | 111 | 2. Install dependencies: 112 | ```bash 113 | pip install -r requirements.txt 114 | ``` 115 | 116 | 3. Configure environment variables: 117 | ```bash 118 | export DSPY_MODEL=deepseek/deepseek-chat 119 | ``` 120 | 121 | 4. (Optional) Install development dependencies: 122 | ```bash 123 | pip install -r requirements-dev.txt 124 | ``` 125 | 126 | ## Usage 127 | 128 | 1. Generate Jeopardy questions: 129 | ```bash 130 | ./jeopardy_dataset.py -n 50 131 | ``` 132 | 133 | 2. Run reasoning pipeline benchmark: 134 | ```bash 135 | ./benchmark.py 136 | ``` 137 | 138 | 3. 
View results in `reasoning_benchmark.json` 139 | 140 | ## Configuration 141 | 142 | Customize settings in the scripts: 143 | - `jeopardy_dataset.py`: Adjust categories and question count 144 | - `reasoning_pipeline.py`: Modify reasoning parameters 145 | - `benchmark.py`: Change evaluation metrics 146 | 147 | ## Performance Metrics 148 | 149 | The system tracks comprehensive performance metrics across material batches: 150 | 151 | - **Batch Processing Time**: Average time per batch 152 | - **Batch Accuracy**: Percentage of correct solutions per batch 153 | - **Iteration Efficiency**: Average reasoning iterations per problem 154 | - **Fallacy Detection Rate**: Percentage of detected logical fallacies 155 | - **Objective Achievement**: Success rate in meeting problem objectives 156 | 157 | Benchmark results include aggregate statistics across all batches: 158 | - Overall accuracy 159 | - Average iterations per question 160 | - Total processing time 161 | - Fallacy detection rates 162 | - Objective achievement scores 163 | 164 | ## Contributing 165 | 166 | Contributions are welcome! Please follow these steps: 167 | 168 | 1. Fork the repository 169 | 2. Create a feature branch (`git checkout -b feature/AmazingFeature`) 170 | 3. Commit your changes (`git commit -m 'Add some AmazingFeature'`) 171 | 4. Push to the branch (`git push origin feature/AmazingFeature`) 172 | 5. Open a pull request 173 | 174 | Please make sure to update tests as appropriate and follow the coding style of the project. 175 | 176 | ## License 177 | 178 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 179 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | import json 4 | import dspy 5 | from jeopardy_dataset import JeopardyDatasetGenerator 6 | from reasoning_pipeline import run_reasoning_pipeline 7 | 8 | class VerifyAnswerSignature(dspy.Signature): 9 | predicted_answer = dspy.InputField(desc="The answer predicted by the model") 10 | correct_answer = dspy.InputField(desc="The known correct answer") 11 | verification = dspy.OutputField(desc="True if answers match semantically, False otherwise") 12 | 13 | class AnswerVerifier(dspy.Module): 14 | def __init__(self): 15 | super().__init__() 16 | self.verify = dspy.ChainOfThought(VerifyAnswerSignature) 17 | 18 | def forward(self, predicted_answer, correct_answer): 19 | return self.verify(predicted_answer=predicted_answer, correct_answer=correct_answer) 20 | 21 | def verify_answer_match(predicted_answer, correct_answer): 22 | """Check if the predicted answer matches the correct answer using semantic verification""" 23 | verifier = AnswerVerifier() 24 | result = verifier(predicted_answer, correct_answer) 25 | return result.verification.lower().strip() in ["true", "yes", "correct"] 26 | 27 | def benchmark_reasoning_pipeline(): 28 | print("Benchmarking Reasoning Pipeline Performance...") 29 | 30 | # Load generated dataset 31 | with open("jeopardy_dataset.json") as f: 32 | dataset = json.load(f) 33 | 34 | # Track pipeline performance metrics 35 | pipeline_metrics = { 36 | "total_questions": len(dataset), 37 | "total_iterations": 0, 38 | "correct_answers": 0, 39 | "time_seconds": 0 40 | } 41 | 42 | start_time = time.time() 43 | for i, item in enumerate(dataset, 1): 44 | print(f"\nTesting Pipeline on Question {i}/{len(dataset)}") 45 | # Run reasoning pipeline directly 46 
| context = f""" 47 | Final Question: {item["question"]} 48 | Hint: {item["hint"]} 49 | Answer: {item["answer"]} 50 | """ 51 | objective = "Determine the correct answer to the question using the provided hint" 52 | 53 | reasoning_output = [] 54 | def capture_reasoning(iteration, context, objective, result): 55 | reasoning_output.append({ 56 | "iteration": iteration, 57 | "context": context, 58 | "objective": objective, 59 | "result": result 60 | }) 61 | 62 | run_reasoning_pipeline(context, objective, callback=capture_reasoning) 63 | 64 | # Check if final reasoning output matches correct answer 65 | if reasoning_output: 66 | final_result = reasoning_output[-1]["result"] 67 | is_correct = verify_answer_match(final_result.reasoning_output, item["answer"]) 68 | pipeline_metrics["correct_answers"] += int(is_correct) 69 | pipeline_metrics["total_iterations"] += len(reasoning_output) 70 | 71 | # Print progress 72 | print(f"\nCurrent Progress: {i}/{pipeline_metrics['total_questions']}") 73 | print(f"Iterations: {len(reasoning_output)}") 74 | current_accuracy = pipeline_metrics["correct_answers"] / i 75 | print(f"Current Accuracy: {current_accuracy:.1%}") 76 | 77 | elapsed_time = time.time() - start_time 78 | print() # New line after progress 79 | 80 | # Calculate final averages 81 | pipeline_metrics["time_seconds"] = elapsed_time 82 | pipeline_metrics["accuracy"] = pipeline_metrics["correct_answers"] / pipeline_metrics["total_questions"] 83 | pipeline_metrics["average_iterations"] = pipeline_metrics["total_iterations"] / pipeline_metrics["total_questions"] 84 | 85 | # Save results 86 | with open("reasoning_benchmark.json", "w") as f: 87 | json.dump(pipeline_metrics, f, indent=2) 88 | 89 | print(f"Tested pipeline on {pipeline_metrics['total_questions']} questions in {elapsed_time:.2f} seconds") 90 | print(f"Average Iterations: {pipeline_metrics['average_iterations']:.1f}") 91 | print(f"Answer Accuracy: {pipeline_metrics['accuracy']:.1%}") 92 | 93 | if __name__ == "__main__": 94 | benchmark_reasoning_pipeline() 95 | print("\nBenchmark results saved to reasoning_benchmark.json") 96 | -------------------------------------------------------------------------------- /deepseek_aider_evaluator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from deepeval.benchmarks import Aider 4 | from deepeval.benchmarks.tasks import AiderTask 5 | from deepeval.models import DeepSeekModel 6 | import dspy 7 | import json 8 | from tqdm import tqdm 9 | 10 | class DeepSeekEvaluator: 11 | def __init__(self): 12 | # Configure DeepSeek model 13 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False) 14 | dspy.settings.configure(lm=self.lm) 15 | 16 | # Initialize DeepEval Aider benchmark 17 | self.benchmark = Aider( 18 | tasks=[ 19 | AiderTask.CODE_EDITING, 20 | AiderTask.CODE_REFACTORING 21 | ], 22 | n=100 # Number of code generation samples 23 | ) 24 | 25 | def evaluate(self): 26 | print("Starting DeepSeek evaluation on Aider benchmark...") 27 | 28 | # Create DeepSeek model wrapper for DeepEval 29 | class DeepSeekWrapper(DeepSeekModel): 30 | def __init__(self): 31 | super().__init__() 32 | self.model = self.lm 33 | 34 | def generate_samples(self, prompt: str, n: int, temperature: float) -> tuple[str, float]: 35 | # Use DSPy's DeepSeek model for generation 36 | result = self.model(prompt) 37 | return result, 1.0 # Return generated text and confidence score 38 | 39 | def load_model(self): 40 | # Initialize the model 41 | self.model 
= self.lm 42 | 43 | # Evaluate the model 44 | self.benchmark.evaluate(model=DeepSeekWrapper(), k=10) 45 | 46 | # Print results 47 | print("\nEvaluation Results:") 48 | print(f"Overall Score: {self.benchmark.overall_score:.1%}") 49 | print("\nTask-wise Scores:") 50 | for task, score in self.benchmark.task_scores.items(): 51 | print(f"{task}: {score:.1%}") 52 | 53 | # Save results 54 | results = { 55 | "overall_score": self.benchmark.overall_score, 56 | "task_scores": self.benchmark.task_scores, 57 | "model": "deepseek/deepseek-chat", 58 | "temperature": 0.3 59 | } 60 | 61 | with open("deepseek_aider_results.json", "w") as f: 62 | json.dump(results, f, indent=2) 63 | 64 | print("\nResults saved to deepseek_aider_results.json") 65 | 66 | if __name__ == "__main__": 67 | evaluator = DeepSeekEvaluator() 68 | evaluator.evaluate() 69 | -------------------------------------------------------------------------------- /dynamic_fewshot_routing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from simpledspy import pipe 4 | import numpy as np 5 | from math_calculator import MathCalculator 6 | import random 7 | 8 | 9 | calculator = MathCalculator() 10 | 11 | 12 | def generate_multiplication_task(num_digits): 13 | first_digit = np.random.randint(0, 10**num_digits) 14 | second_digit = np.random.randint(0, 10**num_digits) 15 | solution = first_digit * second_digit 16 | task = f"{first_digit} * {second_digit}" 17 | return task, solution 18 | 19 | 20 | def generate_program_reasoning_task(): 21 | num_digits = 4 22 | first_digit = np.random.randint(0, 10**num_digits) 23 | second_digit = np.random.randint(0, 10**num_digits) 24 | if second_digit > 5000: 25 | solution = 10 26 | else: 27 | solution = first_digit * second_digit 28 | # return 100 29 | task = f"f({first_digit}, {second_digit}) = " 30 | return task, solution 31 | 32 | def sample_memories(memory): 33 | return_list = [] 34 | # for sample in memory: 35 | for i, sample in enumerate(memory): 36 | # sample with prob of weight 37 | if random.random() < sample['weight']: 38 | sample['i'] = i 39 | return_list.append(sample) 40 | 41 | return return_list 42 | 43 | def construct_prompt(fewshot_samples): 44 | fewshot_str = "" 45 | hypothesis_str = "" 46 | for sample in fewshot_samples: 47 | if 'task' in sample: 48 | # fewshot_str += f"Task: {sample['task']}\nOutput: {sample['output']}\n" 49 | fewshot_str += f"Task: {sample['task']}\nOutput: {sample['output']}\Output score: {sample['metric']}\n" 50 | elif 'hypothesis' in sample: 51 | hypothesis_str += f"Hypothesis: {sample['hypothesis']}\n" 52 | 53 | return fewshot_str, hypothesis_str 54 | 55 | instruction = '' 56 | metric_values = [] 57 | score_values = [] 58 | memory = [] 59 | iteration = 0 60 | while True: 61 | # for i in range(10): 62 | task, solution = generate_multiplication_task(4) 63 | # task, solution = generate_program_reasoning_task() 64 | memory_samples = sample_memories(memory) 65 | # memory_str = construct_prompt(memory_samples) 66 | fewshot_str, hypothesis_str = construct_prompt(memory_samples) 67 | memory_str = f"Hypothesis: {hypothesis_str}\n Fewshot Examples:\n{fewshot_str}" 68 | input_ = f"Current task: {task}\n{memory_str}Task: {task}\nOutput: " 69 | reasoning, result, new_hypothesis = pipe(instruction, input_) 70 | output = (reasoning, result) 71 | # check if number 72 | if result.isdigit() and int(result) == solution: 73 | # metric_values.append(1) 74 | # metric_value = 1 75 | metric_value = min(1, 1/abs(float(result) - 
solution + 0.00001)) 76 | else: 77 | # metric_values.append(0) 78 | metric_value = 0 79 | 80 | num_memory_samples = len(memory_samples) 81 | score = metric_value - (num_memory_samples/100) 82 | score_values.append(score) 83 | avg_score = np.mean(score_values[-100:]) 84 | 85 | metric_values.append(metric_value) 86 | avg_metric = np.mean(metric_values[-100:]) 87 | num_fewshot_samples = len([sample for sample in memory_samples if 'task' in sample]) 88 | num_hypothesis_samples = len([sample for sample in memory_samples if 'hypothesis' in sample]) 89 | weight = avg_score if avg_score > 0 else 0.1 90 | memory.append({'hypothesis': new_hypothesis, 'weight': weight}) 91 | memory.append({'task': task, 'output': output, 'metric': metric_value, 'weight': weight}) 92 | # if metric_value == 1: 93 | # memory.append({'task': task, 'output': output, 'metric': metric_value, 'weight': weight}) 94 | 95 | for sample in memory_samples: 96 | penalty_factor = 0.5 97 | # if metric_value == 1: 98 | if score > avg_score: 99 | memory[sample['i']]['weight'] *= ((1-penalty_factor)/avg_metric) + penalty_factor 100 | else: 101 | memory[sample['i']]['weight'] *= penalty_factor 102 | 103 | if memory[sample['i']]['weight'] > 1: 104 | memory[sample['i']]['weight'] = 1 105 | 106 | 107 | 108 | 109 | print(f"input_: {input_}") 110 | print(f"reasoning: {reasoning}") 111 | print(f"hypothesis_str: {hypothesis_str}") 112 | print(f'Hypothesis: {new_hypothesis}') 113 | print(f"iter: {iteration}, task: {task}, Solution: {solution}, result: {result}, num_fs: {num_fewshot_samples}, num_hypo: {num_hypothesis_samples}, Avg Metric: {avg_metric}") 114 | iteration += 1 115 | 116 | 117 | -------------------------------------------------------------------------------- /jeopardy_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | from tqdm import tqdm 6 | 7 | # Configure the LM with temperature=1.5 and no caching 8 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 9 | dspy.settings.configure(lm=lm) 10 | 11 | # Define signatures for three-step question generation 12 | class GenerateAnswerSignature(dspy.Signature): 13 | category = dspy.InputField(desc="The category for the question") 14 | answer = dspy.OutputField(desc="A challenging answer for a Jeopardy question. 
Generate just the answer, not the question.") 15 | 16 | class GenerateInitialQuestionSignature(dspy.Signature): 17 | category = dspy.InputField(desc="The category for the question") 18 | answer = dspy.InputField(desc="The specific answer to create a question for") 19 | question = dspy.OutputField(desc="A Jeopardy-style clue that leads to the answer") 20 | 21 | class GenerateHintSignature(dspy.Signature): 22 | category = dspy.InputField(desc="The category for the question") 23 | answer = dspy.InputField(desc="The specific answer to create a hint for") 24 | initial_question = dspy.InputField(desc="The initial question that directly leads to the answer") 25 | hint = dspy.OutputField(desc="An indirect clue that points to the answer without repeating information from the initial question") 26 | 27 | class GenerateChallengingQuestionSignature(dspy.Signature): 28 | category = dspy.InputField(desc="The category for the question") 29 | answer = dspy.InputField(desc="The specific answer to create a question for") 30 | hint = dspy.InputField(desc="An indirect clue that points to the answer") 31 | question = dspy.OutputField(desc="A challenging Jeopardy-style clue that incorporates the hint and requires reasoning to reach the answer") 32 | 33 | class JeopardyDatasetGenerator(dspy.Module): 34 | def __init__(self): 35 | super().__init__() 36 | self.generate_answer = dspy.ChainOfThought(GenerateAnswerSignature) 37 | self.generate_initial_question = dspy.ChainOfThought(GenerateInitialQuestionSignature) 38 | self.generate_hint = dspy.ChainOfThought(GenerateHintSignature) 39 | self.generate_challenging_question = dspy.ChainOfThought(GenerateChallengingQuestionSignature) 40 | 41 | def generate_dataset(self, categories, num_questions_per_category=1): 42 | dataset = [] 43 | total_questions = len(categories) * num_questions_per_category 44 | 45 | # Single progress bar for all questions 46 | with tqdm(total=total_questions, desc="Generating Questions") as pbar: 47 | for category in categories: 48 | for _ in range(num_questions_per_category): 49 | # First generate a challenging answer 50 | answer_result = self.generate_answer(category=category) 51 | 52 | # First generate an initial direct question 53 | initial_question_result = self.generate_initial_question( 54 | category=category, 55 | answer=answer_result.answer 56 | ) 57 | 58 | # Generate a hint that points to the answer without repeating the initial question 59 | hint_result = self.generate_hint( 60 | category=category, 61 | answer=answer_result.answer, 62 | initial_question=initial_question_result.question 63 | ) 64 | 65 | # Generate a more challenging question using the hint 66 | question_result = self.generate_challenging_question( 67 | category=category, 68 | answer=answer_result.answer, 69 | hint=hint_result.hint 70 | ) 71 | 72 | # Create the dataset entry 73 | entry = { 74 | "category": category, 75 | "question": question_result.question, 76 | "answer": answer_result.answer, 77 | "initial_question": initial_question_result.question, 78 | "hint": hint_result.hint 79 | } 80 | dataset.append(entry) 81 | 82 | # Print formatted output 83 | print("\nGenerated Question:") 84 | print(f"Category: {entry['category']}") 85 | print(f"Initial Question: {entry['initial_question']}") 86 | print(f"Hint: {entry['hint']}") 87 | print(f"Final Question: {entry['question']}") 88 | print(f"Answer: {entry['answer']}") 89 | print("-" * 80) 90 | 91 | # Update progress bar 92 | pbar.update(1) 93 | return dataset 94 | 95 | import argparse 96 | 97 | if __name__ == "__main__": 98 
| # Initialize the generator 99 | generator = JeopardyDatasetGenerator() 100 | 101 | # Define some categories 102 | categories = [ 103 | "History", 104 | "Science & Nature", 105 | "Literature", 106 | "Pop Culture", 107 | "Geography", 108 | "Technology", 109 | "Computers", 110 | "Artificial Intelligence", 111 | "LLMs", 112 | "Deep Learning", 113 | ] 114 | 115 | # Set up argument parser 116 | parser = argparse.ArgumentParser(description="Generate Jeopardy questions") 117 | parser.add_argument("-n", "--num_questions", type=int, default=50, 118 | help="Number of questions to generate (default: 50)") 119 | args = parser.parse_args() 120 | 121 | # Calculate number of questions per category 122 | num_categories = len(categories) 123 | base_questions = args.num_questions // num_categories 124 | extra_questions = args.num_questions % num_categories 125 | 126 | # Generate questions, cycling through categories 127 | dataset = [] 128 | for i in range(num_categories): 129 | questions_to_generate = base_questions + (1 if i < extra_questions else 0) 130 | if questions_to_generate > 0: 131 | category_questions = generator.generate_dataset( 132 | [categories[i]], 133 | num_questions_per_category=questions_to_generate 134 | ) 135 | dataset.extend(category_questions) 136 | 137 | # Save to JSON file 138 | with open("jeopardy_dataset.json", "w") as f: 139 | json.dump(dataset, f, indent=2) 140 | 141 | print(f"Generated {len(dataset)} Jeopardy questions!") 142 | print("Dataset saved to jeopardy_dataset.json") 143 | -------------------------------------------------------------------------------- /math_calculator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import math 5 | import json 6 | import time 7 | import tqdm 8 | from pprint import pprint 9 | from collections import Counter 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | from typing import List, Dict, Any 12 | from signatures import ( 13 | SolutionSelectorSignature, 14 | MathCalculationSignature, 15 | TaskSplitterSignature, 16 | SubtaskResultSelectorSignature 17 | ) 18 | from math_evaluator import MathEvaluator 19 | 20 | class MathCalculator(dspy.Module): 21 | """Base math calculator module that ProblemSolver extends""" 22 | def __init__(self): 23 | super().__init__() 24 | self.calculate = dspy.ChainOfThought(MathCalculationSignature) 25 | 26 | def forward(self, task): 27 | """Basic forward pass without advanced reasoning""" 28 | result = self.calculate(task=task) 29 | return dspy.Prediction( 30 | reasoning=result.reasoning, 31 | solution=result.solution, 32 | notes_output=result.notes_output 33 | ) 34 | 35 | class ProblemSolver(dspy.Module): 36 | def __init__(self, max_iterations=5, num_attempts=3, subtask_attempts=3): 37 | """Initialize the ProblemSolver with DSPy modules and configuration. 
38 | 39 | Args: 40 | max_iterations: Maximum number of reasoning iterations per attempt 41 | num_attempts: Number of attempts to solve each task 42 | subtask_attempts: Number of attempts to solve each subtask 43 | """ 44 | super().__init__() 45 | # Initialize instance variables first 46 | self.max_iterations = max_iterations 47 | self.num_attempts = num_attempts 48 | self.subtask_attempts = subtask_attempts 49 | 50 | self.reasoning_tree = { 51 | 'root': None, 52 | 'nodes': {}, 53 | 'metadata': { 54 | 'start_time': time.time(), 55 | 'config': { 56 | 'max_iterations': self.max_iterations, 57 | 'num_attempts': self.num_attempts, 58 | 'subtask_attempts': self.subtask_attempts 59 | } 60 | } 61 | } 62 | self.current_node_id = 0 63 | 64 | # Initialize DSPy modules 65 | self.calculate = dspy.ChainOfThought(MathCalculationSignature) 66 | self.select_solution = dspy.ChainOfThought(SolutionSelectorSignature) 67 | self.split_task = dspy.ChainOfThought(TaskSplitterSignature) 68 | self.select_subtask_result = dspy.ChainOfThought(SubtaskResultSelectorSignature) 69 | 70 | def _create_node(self, task, parent_id=None, node_type='task', input_data=None, output_data=None): 71 | """Create a new node in the reasoning tree with input/output tracking""" 72 | node_id = f"node_{self.current_node_id}" 73 | self.current_node_id += 1 74 | 75 | node = { 76 | 'id': node_id, 77 | 'type': node_type, 78 | 'task': task, 79 | 'parent': parent_id, 80 | 'children': [], 81 | 'attempts': [], 82 | 'input': input_data if input_data else {}, 83 | 'output': output_data if output_data else {}, 84 | 'timestamp': time.time() 85 | } 86 | 87 | self.reasoning_tree['nodes'][node_id] = node 88 | 89 | if parent_id: 90 | self.reasoning_tree['nodes'][parent_id]['children'].append(node_id) 91 | 92 | if not self.reasoning_tree['root']: 93 | self.reasoning_tree['root'] = node_id 94 | 95 | return node_id 96 | 97 | def _split_task(self, task, depth=0, max_depth=3): 98 | """Split a general problem into subtasks using DSPy reasoning""" 99 | if depth >= max_depth: 100 | print(f"Max recursion depth {max_depth} reached for task: {task}") 101 | return [task] 102 | 103 | try: 104 | # Log task splitting attempt 105 | print(f"Attempting to split task (Depth {depth}): {task}") 106 | 107 | result = self.split_task(task=task, context="") 108 | if not hasattr(result, 'subtasks'): 109 | print(f"Failed to split task - no subtasks returned: {task}") 110 | return [task] 111 | 112 | # Parse subtasks from the output 113 | subtasks = [] 114 | if isinstance(result.subtasks, str): 115 | subtasks = [s.strip() for s in result.subtasks.split('\n') if s.strip()] 116 | elif isinstance(result.subtasks, list): 117 | subtasks = [str(s).strip() for s in result.subtasks if str(s).strip()] 118 | 119 | # Log the split reasoning and results 120 | print(f"Task Split Reasoning (Depth {depth}):\n{result.split_reasoning}") 121 | print(f"Generated Subtasks: {subtasks}") 122 | 123 | # Recursively split subtasks if needed 124 | final_subtasks = [] 125 | for subtask in subtasks: 126 | try: 127 | # Only split further if the subtask is complex enough 128 | if len(subtask.split()) > 5: # Simple heuristic based on length 129 | final_subtasks.extend(self._split_task(subtask, depth+1, max_depth)) 130 | else: 131 | final_subtasks.append(subtask) 132 | except Exception as e: 133 | print(f"Error recursively splitting subtask {subtask}: {e}") 134 | final_subtasks.append(subtask) 135 | 136 | return final_subtasks if final_subtasks else [task] 137 | except Exception as e: 138 | print(f"Error splitting 
task {task}: {e}") 139 | return [task] 140 | 141 | def _combine_subtask_results(self, subtask_results: List[dspy.Prediction]) -> Dict[str, Any]: 142 | """Combine results from DSPy-generated subtasks""" 143 | if not subtask_results: 144 | return dspy.Prediction( 145 | reasoning="No subtask results to combine", 146 | solution=None, 147 | notes_output="" 148 | ) 149 | 150 | # Build combined reasoning 151 | combined_reasoning = [] 152 | combined_solution = [] 153 | 154 | for i, result in enumerate(subtask_results, 1): 155 | combined_reasoning.append( 156 | f"Subtask {i}:\n" 157 | f"Reasoning: {result.reasoning}\n" 158 | f"Solution: {result.solution}\n" 159 | ) 160 | if result.solution: 161 | combined_solution.append(str(result.solution)) 162 | 163 | # Combine solutions in a meaningful way 164 | final_solution = "\n".join(combined_solution) if combined_solution else "No solution found" 165 | 166 | return dspy.Prediction( 167 | reasoning="Combined subtask results:\n" + "\n".join(combined_reasoning), 168 | solution=final_solution, 169 | notes_output="Combined results from subtasks" 170 | ) 171 | 172 | def forward(self, task): 173 | """Forward pass for the math calculator with recursive task splitting""" 174 | # First try to split the task into subtasks recursively 175 | subtasks = self._split_task(task, max_depth=3) # Set max recursion depth 176 | 177 | if len(subtasks) > 1: 178 | # Process each subtask independently with multiple attempts 179 | subtask_results = [] 180 | for subtask in subtasks: 181 | if subtask in ['+', '-', '*', '/', '^', '√', '%']: 182 | # Keep operators as-is 183 | subtask_results.append(dspy.Prediction( 184 | reasoning="Operator", 185 | solution=subtask, 186 | notes_output="" 187 | )) 188 | else: 189 | # Process numerical subtasks with multiple attempts 190 | result = self._process_subtask(subtask) 191 | subtask_results.append(result) 192 | 193 | # Combine subtask results 194 | final_solution = self._combine_subtask_results(subtask_results) 195 | final_reasoning = "\n".join( 196 | f"Subtask {i+1} ({subtask}):\n{r.reasoning}\nSolution: {r.solution}\n" 197 | for i, (subtask, r) in enumerate(zip(subtasks, subtask_results)) 198 | ) 199 | 200 | return dspy.Prediction( 201 | reasoning=f"Task split into {len(subtasks)} subtasks:\n{final_reasoning}", 202 | solution=final_solution, 203 | notes_output="Task split into subtasks" 204 | ) 205 | 206 | # Fall back to original processing if no subtasks found 207 | attempts = [] 208 | 209 | # Run multiple attempts 210 | for attempt in range(self.num_attempts): 211 | context = "" 212 | final_reasoning = "" 213 | final_solution = "" 214 | 215 | for iteration in range(self.max_iterations): 216 | try: 217 | result = self.calculate(task=task, context=context) 218 | 219 | # Validate required fields 220 | if not all(hasattr(result, field) for field in ['reasoning', 'solution', 'notes_output', 'iteration_control']): 221 | raise ValueError("Missing required fields in model output") 222 | 223 | # Accumulate reasoning 224 | final_reasoning += f"\nAttempt {attempt + 1}, Iteration {iteration + 1} Reasoning:\n{result.reasoning}" 225 | 226 | # Build context for next iteration 227 | iteration_context = ( 228 | f"Iteration {iteration + 1}:\n" 229 | f"Reasoning: {result.reasoning}\n" 230 | f"Solution: {result.solution}\n" 231 | f"Notes: {result.notes_output}\n" 232 | ) 233 | context += "\n" + iteration_context 234 | 235 | # Store the latest solution 236 | final_solution = result.solution 237 | 238 | # Check if we should terminate 239 | if 
result.iteration_control.lower().strip() == "terminate": 240 | break 241 | 242 | except ValueError as e: 243 | print(f"Validation error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 244 | continue 245 | except RuntimeError as e: 246 | print(f"Runtime error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 247 | continue 248 | except Exception as e: 249 | print(f"Unexpected error in attempt {attempt + 1}, iteration {iteration + 1}: {str(e)}") 250 | continue 251 | 252 | attempts.append({ 253 | 'reasoning': final_reasoning, 254 | 'solution': final_solution, 255 | 'notes_output': context 256 | }) 257 | 258 | # Select the best solution 259 | selection_result = self.select_solution( 260 | task=task, 261 | solutions=[f"Attempt {i+1}:\nReasoning: {a['reasoning']}\nSolution: {a['solution']}" 262 | for i, a in enumerate(attempts)], 263 | selection_criteria="Select the solution that is mathematically correct, logically consistent, " 264 | "has clear reasoning, and provides a complete solution to the task" 265 | ) 266 | 267 | # Find the selected solution 268 | selected_solution = selection_result.selected_solution 269 | selection_reasoning = selection_result.selection_reasoning 270 | 271 | # Try to match the selected solution 272 | for attempt in attempts: 273 | if attempt['solution'] == selected_solution: 274 | # Add selection reasoning to the final output 275 | final_reasoning = ( 276 | f"Selected Solution Reasoning:\n{selection_reasoning}\n\n" 277 | f"Solution Details:\n{attempt['reasoning']}" 278 | ) 279 | return dspy.Prediction( 280 | reasoning=final_reasoning, 281 | solution=attempt['solution'], 282 | notes_output=attempt['notes_output'] 283 | ) 284 | 285 | # If no solution was selected, choose the most consistent one 286 | if len(attempts) > 1: 287 | # Find the most common solution 288 | solution_counts = Counter(a['solution'] for a in attempts) 289 | most_common_solution = solution_counts.most_common(1)[0][0] 290 | 291 | # Return the first attempt with the most common solution 292 | for attempt in attempts: 293 | if attempt['solution'] == most_common_solution: 294 | final_reasoning = ( 295 | "No clear selection - using most consistent solution:\n" 296 | f"Solution appeared {solution_counts[most_common_solution]} times\n\n" 297 | f"Solution Details:\n{attempt['reasoning']}" 298 | ) 299 | return dspy.Prediction( 300 | reasoning=final_reasoning, 301 | solution=attempt['solution'], 302 | notes_output=attempt['notes_output'] 303 | ) 304 | 305 | # Fall back to the first attempt 306 | final_reasoning = ( 307 | "Using first attempt as fallback solution\n\n" 308 | f"Solution Details:\n{attempts[0]['reasoning']}" 309 | ) 310 | return dspy.Prediction( 311 | reasoning=final_reasoning, 312 | solution=attempts[0]['solution'], 313 | notes_output=attempts[0]['notes_output'] 314 | ) 315 | 316 | def evaluate_on_dataset(self, dataset_path="math_dataset.json", max_iter=None, num_threads=10): 317 | evaluator = MathEvaluator(self, num_threads) 318 | return evaluator.evaluate_on_dataset(dataset_path) 319 | 320 | def _process_subtask(self, subtask, parent_id=None): 321 | """Process a subtask with multiple attempts and select the best result""" 322 | # Create node for this subtask 323 | subtask_node_id = self._create_node( 324 | task=subtask, 325 | parent_id=parent_id, 326 | node_type='subtask', 327 | input_data={ 328 | 'subtask': subtask, 329 | 'parent_id': parent_id, 330 | 'timestamp': time.time() 331 | } 332 | ) 333 | 334 | attempts = [] 335 | 336 | for attempt in 
range(self.subtask_attempts): 337 | # Create node for this attempt 338 | attempt_node_id = self._create_node( 339 | task=subtask, 340 | parent_id=subtask_node_id, 341 | node_type='attempt', 342 | input_data={ 343 | 'attempt_number': attempt + 1, 344 | 'subtask': subtask, 345 | 'timestamp': time.time() 346 | } 347 | ) 348 | try: 349 | result = self._forward_with_max_iter(subtask, self.max_iterations) 350 | 351 | # Update attempt node with output 352 | self.reasoning_tree['nodes'][attempt_node_id]['output'] = { 353 | 'reasoning': result.reasoning, 354 | 'solution': result.solution, 355 | 'notes': result.notes_output, 356 | 'timestamp': time.time() 357 | } 358 | 359 | attempts.append({ 360 | 'reasoning': result.reasoning, 361 | 'solution': result.solution, 362 | 'notes': result.notes_output 363 | }) 364 | except Exception as e: 365 | print(f"Error in subtask attempt {attempt + 1}: {e}") 366 | continue 367 | 368 | # Select the best result using DSPy 369 | if len(attempts) > 1: 370 | selection_result = self.select_subtask_result( 371 | subtask=subtask, 372 | attempts=[f"Attempt {i+1}:\nReasoning: {a['reasoning']}\nSolution: {a['solution']}" 373 | for i, a in enumerate(attempts)] 374 | ) 375 | 376 | # Find the selected solution 377 | for attempt in attempts: 378 | if attempt['solution'] == selection_result.selected_solution: 379 | return dspy.Prediction( 380 | reasoning=f"Selected Solution Reasoning:\n{selection_result.selection_reasoning}\n\n" 381 | f"Solution Details:\n{attempt['reasoning']}", 382 | solution=attempt['solution'], 383 | notes_output=attempt['notes'] 384 | ) 385 | 386 | # If no selection or only one attempt, return the first result 387 | if attempts: 388 | return dspy.Prediction( 389 | reasoning=attempts[0]['reasoning'], 390 | solution=attempts[0]['solution'], 391 | notes_output=attempts[0]['notes'] 392 | ) 393 | 394 | # Fallback if all attempts failed 395 | return dspy.Prediction( 396 | reasoning="All attempts failed to solve the subtask", 397 | solution="0", 398 | notes_output="" 399 | ) 400 | 401 | def _forward_with_max_iter(self, task, max_iter): 402 | """Modified forward pass with configurable max iterations""" 403 | context = "" 404 | final_reasoning = "" 405 | final_solution = "" 406 | 407 | for iteration in range(max_iter): 408 | try: 409 | result = self.calculate(task=task, context=context) 410 | 411 | # Validate required fields 412 | if not all(hasattr(result, field) for field in ['reasoning', 'solution', 'notes_output', 'iteration_control']): 413 | raise ValueError("Missing required fields in model output") 414 | 415 | # Accumulate reasoning 416 | final_reasoning += f"\nIteration {iteration + 1} Reasoning:\n{result.reasoning}" 417 | 418 | # Build context for next iteration 419 | iteration_context = ( 420 | f"Iteration {iteration + 1}:\n" 421 | f"Reasoning: {result.reasoning}\n" 422 | f"Solution: {result.solution}\n" 423 | f"Notes: {result.notes_output}\n" 424 | ) 425 | context += "\n" + iteration_context 426 | 427 | # Store the latest solution 428 | final_solution = result.solution 429 | 430 | # Check if we should terminate 431 | if result.iteration_control.lower().strip() == "terminate": 432 | break 433 | 434 | except ValueError as e: 435 | print(f"Validation error in iteration {iteration + 1}: {str(e)}") 436 | continue 437 | except RuntimeError as e: 438 | print(f"Runtime error in iteration {iteration + 1}: {str(e)}") 439 | continue 440 | except Exception as e: 441 | print(f"Unexpected error in iteration {iteration + 1}: {str(e)}") 442 | continue 443 | 444 | return 
dspy.Prediction( 445 | reasoning=final_reasoning, 446 | solution=final_solution, 447 | notes_output=context 448 | ) 449 | 450 | def save_reasoning_tree(self, path="reasoning_tree.json"): 451 | """Save the full reasoning tree to a JSON file with enhanced details""" 452 | # Add final metadata 453 | self.reasoning_tree['metadata']['end_time'] = time.time() 454 | self.reasoning_tree['metadata']['duration'] = ( 455 | self.reasoning_tree['metadata']['end_time'] - 456 | self.reasoning_tree['metadata']['start_time'] 457 | ) 458 | 459 | # Save with pretty printing 460 | with open(path, "w") as f: 461 | json.dump(self.reasoning_tree, f, indent=2, sort_keys=True) 462 | 463 | print(f"Reasoning tree saved to {path}") 464 | print(f"Total nodes: {len(self.reasoning_tree['nodes'])}") 465 | print(f"Duration: {self.reasoning_tree['metadata']['duration']:.2f}s") 466 | 467 | def _is_correct(self, predicted, expected): 468 | """Compare solutions with tolerance for floating point""" 469 | try: 470 | # Handle both string and numeric inputs 471 | predicted_num = float(predicted) if isinstance(predicted, str) else float(predicted) 472 | expected_num = float(expected) if isinstance(expected, str) else float(expected) 473 | 474 | # Handle NaN and infinity 475 | if math.isnan(predicted_num) or math.isnan(expected_num): 476 | return False 477 | if math.isinf(predicted_num) or math.isinf(expected_num): 478 | return False 479 | 480 | # Compare with tolerance 481 | return abs(predicted_num - expected_num) < 0.01 482 | except (ValueError, TypeError) as e: 483 | print(f"⚠️ Error evaluating solution - invalid number format: {str(e)}") 484 | return False 485 | except Exception as e: 486 | print(f"⚠️ Unexpected error evaluating solution: {str(e)}") 487 | return False 488 | 489 | 490 | if __name__ == "__main__": 491 | # Configure DSPy 492 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False) 493 | dspy.settings.configure(lm=lm) 494 | 495 | # Create calculator instance with subtask processing 496 | calculator = ProblemSolver(max_iterations=3, num_attempts=2, subtask_attempts=2) 497 | 498 | # Test complex task that should be split into subtasks 499 | complex_task = "Calculate (3 + 4) * (5 - 2) / (6 + 3)" 500 | 501 | print(f"\nProcessing complex task: {complex_task}") 502 | result = calculator.forward(complex_task) 503 | 504 | print("\nFinal Result:") 505 | print(f"Reasoning:\n{result.reasoning}") 506 | print(f"Solution: {result.solution}") 507 | 508 | # Save result 509 | with open("subtask_result.json", "w") as f: 510 | json.dump({ 511 | "task": complex_task, 512 | "reasoning": result.reasoning, 513 | "solution": result.solution 514 | }, f, indent=2) 515 | print("\nResult saved to subtask_result.json") 516 | -------------------------------------------------------------------------------- /math_calculator_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | from dspy.teleprompt import MIPROv2, BootstrapFewShotWithRandomSearch, BootstrapFewShot 6 | from math_calculator import MathCalculator, MathCalculationSignature 7 | import tqdm 8 | import logging 9 | 10 | logging.basicConfig(filename='optimization_log.txt', level=logging.INFO) 11 | 12 | # Set global tqdm configuration 13 | tqdm.tqdm.pandas() 14 | tqdm.tqdm.get_lock().locks = [] 15 | tqdm.tqdm.ncols = 60 16 | 17 | class MathOptimizer: 18 | def __init__(self): 19 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 20 | 
dspy.settings.configure(lm=self.lm) 21 | self.calculator = MathCalculator() 22 | self.student = None 23 | self.teacher = None 24 | 25 | def set_student(self, student): 26 | """Set the student model for optimization""" 27 | self.student = student 28 | 29 | def set_teacher(self, teacher): 30 | """Set the teacher model for optimization""" 31 | self.teacher = teacher 32 | 33 | def load_dataset(self, dataset_path="math_dataset.json"): 34 | with open(dataset_path) as f: 35 | dataset = json.load(f) 36 | # First 100 samples for validation, rest for training 37 | return dataset[100:], dataset[:100] 38 | 39 | def create_trainset(self, dataset): 40 | trainset = [] 41 | # for item in tqdm.tqdm(dataset[:100], ncols=60): 42 | for item in dataset: 43 | trainset.append(dspy.Example( 44 | task=item['task'], 45 | solution=item['solution'] 46 | ).with_inputs('task')) 47 | return trainset 48 | 49 | def optimize(self, trainset, num_candidates=10, base_model=None): 50 | # Define the metric function with subtask reasoning evaluation 51 | def metric(example, prediction, trace=None): 52 | try: 53 | # Handle both string and numeric solutions 54 | pred_solution = float(prediction.solution) if isinstance(prediction.solution, str) else prediction.solution 55 | exp_solution = float(example.solution) if isinstance(example.solution, str) else example.solution 56 | 57 | # Compare with tolerance for floating point numbers 58 | accuracy = int(abs(pred_solution - exp_solution) < 0.01) 59 | 60 | # Evaluate subtask reasoning quality 61 | if hasattr(prediction, 'reasoning'): 62 | reasoning = prediction.reasoning.lower() 63 | # Check for subtask indicators 64 | if 'subtask' in reasoning or 'step' in reasoning or 'part' in reasoning: 65 | # Additional points for using subtask reasoning 66 | accuracy += 1 67 | # Check for proper combination of subtasks 68 | if 'combine' in reasoning or 'final result' in reasoning: 69 | accuracy += 1 70 | 71 | return min(accuracy, 1) # Cap at 1 to maintain binary metric 72 | except (ValueError, TypeError, AttributeError) as e: 73 | print(f"Metric error: {e}") 74 | return 0 75 | 76 | # Configure MIPRO optimizer with subtask reasoning focus 77 | teleprompter = MIPROv2( 78 | metric=metric, 79 | num_candidates=num_candidates, 80 | init_temperature=1.0, 81 | prompt_model=self.lm, 82 | task_model=self.lm, 83 | num_threads=100, 84 | auto='light', 85 | track_stats=True 86 | ) 87 | 88 | # Set student and teacher if not already set 89 | if self.student is None: 90 | self.set_student(base_model) 91 | if self.teacher is None: 92 | self.set_teacher(base_model) 93 | 94 | # Run optimization with subtask reasoning focus 95 | optimized_calculator = teleprompter.compile( 96 | student=self.student, 97 | teacher=self.teacher, 98 | trainset=trainset, 99 | num_trials=7, 100 | max_bootstrapped_demos=3, 101 | max_labeled_demos=4, 102 | requires_permission_to_run=False, 103 | minibatch=True, 104 | ) 105 | 106 | 107 | return optimized_calculator 108 | 109 | def save_optimized_model(self, optimized_calculator, path="optimized_models/optimized_math_calculator.json"): 110 | optimized_calculator.save(path) 111 | print(f"Optimized model saved to {path}") 112 | 113 | from concurrent.futures import ThreadPoolExecutor, as_completed 114 | 115 | def evaluate_single_task(calculator, item): 116 | try: 117 | result = calculator.forward(item['task']) 118 | # Handle both string and numeric solutions 119 | pred_solution = float(result.solution) if isinstance(result.solution, str) else result.solution 120 | exp_solution = 
float(item['solution']) if isinstance(item['solution'], str) else item['solution'] 121 | 122 | # Compare with tolerance for floating point numbers 123 | return int(abs(pred_solution - exp_solution) < 0.01) 124 | except (ValueError, TypeError, AttributeError) as e: 125 | print(f"Evaluation error for task {item['task']}: {e}") 126 | return 0 127 | 128 | def evaluate_model(calculator, dataset, num_threads=10): 129 | correct = 0 130 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 131 | futures = [ 132 | executor.submit(evaluate_single_task, calculator, item) 133 | for item in dataset[:100] 134 | ] 135 | for future in tqdm.tqdm(as_completed(futures), total=len(futures), 136 | ncols=60): 137 | correct += future.result() 138 | return correct / 100 # Return accuracy 139 | 140 | if __name__ == "__main__": 141 | optimizer = MathOptimizer() 142 | 143 | # Load and split dataset 144 | train_data, val_data = optimizer.load_dataset() 145 | trainset = optimizer.create_trainset(train_data) 146 | 147 | # Initialize results tracking 148 | results = [] 149 | current_calculator = optimizer.calculator 150 | 151 | # Evaluate initial model on validation set 152 | print("Evaluating initial model...") 153 | initial_accuracy = evaluate_model(current_calculator, val_data, num_threads=20) 154 | results.append(("Initial", initial_accuracy)) 155 | print(f"Initial accuracy: {initial_accuracy:.1%}") 156 | 157 | current_calculator_student = current_calculator.deepcopy() 158 | 159 | # Run multiple optimization iterations with memory cleanup 160 | num_iterations = 1 161 | for i in range(num_iterations): 162 | print(f"\nStarting optimization iteration {i+1}/{num_iterations}...") 163 | current_calculator = current_calculator.deepcopy() 164 | current_calculator = current_calculator.reset_copy() 165 | 166 | # Set student and teacher for this iteration 167 | optimizer.set_student(current_calculator_student) 168 | optimizer.set_teacher(current_calculator) 169 | 170 | # Run optimization on current calculator 171 | optimized_calculator = optimizer.optimize(trainset, num_candidates=3, base_model=current_calculator) 172 | 173 | # Evaluate optimized model on validation set 174 | accuracy = evaluate_model(optimized_calculator, val_data, num_threads=20) 175 | 176 | # Explicit memory cleanup 177 | del current_calculator 178 | import gc 179 | gc.collect() 180 | current_calculator = optimized_calculator 181 | results.append((f"Iteration {i+1}", accuracy)) 182 | print(f"Optimization iteration {i+1} accuracy: {accuracy:.1%}") 183 | 184 | # Save optimized model to optimized_models directory 185 | import os 186 | os.makedirs("optimized_models", exist_ok=True) 187 | model_path = f"optimized_models/optimized_math_calculator_iter{i+1}.json" 188 | optimizer.save_optimized_model(optimized_calculator, model_path) 189 | 190 | # Set as current calculator for next iteration 191 | current_calculator = optimized_calculator 192 | 193 | # Print all results 194 | print("\nFinal Results:") 195 | for stage, accuracy in results: 196 | print(f"{stage}: {accuracy:.1%}") 197 | 198 | print("Optimization complete!") 199 | -------------------------------------------------------------------------------- /math_dataset_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import math 5 | import random 6 | import json 7 | from tqdm import tqdm 8 | 9 | class MathDatasetGenerator: 10 | def __init__(self): 11 | # Configurable parameters for difficulty 12 | 
self.basic_operators = ['+', '-', '*', '/'] 13 | self.advanced_operators = ['^', '√', '%'] # Exponentiation, square root, modulo 14 | self.use_advanced_ops = False # Toggle advanced operators 15 | self.parentheses_prob = 0.3 # Probability of adding parentheses 16 | self.min_num = -10000 # Minimum number value 17 | self.max_num = 10000 # Maximum number value 18 | self.min_ops = 5 # Minimum operations per expression 19 | self.max_ops = 15 # Maximum operations per expression 20 | self.allow_decimals = False # Allow decimal numbers 21 | self.allow_negatives = False # Allow negative numbers 22 | self.allow_variables = False # Include variables in expressions 23 | self.variables = ['x', 'y', 'z'] # Available variables 24 | 25 | def _generate_number(self): 26 | if self.allow_decimals: 27 | return round(random.uniform(self.min_num, self.max_num), 2) 28 | return random.randint(self.min_num, self.max_num) 29 | 30 | def _generate_expression(self): 31 | # Generate number of operations 32 | num_ops = random.randint(self.min_ops, self.max_ops) 33 | 34 | # Choose starting element (number or variable) 35 | if self.allow_variables and random.random() < 0.3: # 30% chance to start with variable 36 | expression = random.choice(self.variables) 37 | else: 38 | expression = str(self._generate_number()) 39 | 40 | for _ in range(num_ops): 41 | # Choose operator 42 | if self.use_advanced_ops and random.random() < 0.5: # 50% chance for advanced op 43 | op = random.choice(self.advanced_operators) 44 | else: 45 | op = random.choice(self.basic_operators) 46 | 47 | # Choose next element (number or variable) 48 | if self.allow_variables and random.random() < 0.3: # 30% chance for variable 49 | next_element = random.choice(self.variables) 50 | else: 51 | next_element = str(self._generate_number()) 52 | 53 | # Handle special operators 54 | if op == '√': # Square root 55 | expression = f"{op}({expression})" 56 | elif op == '^': # Exponentiation 57 | expression = f"({expression}){op}{next_element}" 58 | else: 59 | # Decide whether to add parentheses 60 | if random.random() < self.parentheses_prob: 61 | expression = f"({expression} {op} {next_element})" 62 | else: 63 | expression = f"{expression} {op} {next_element}" 64 | 65 | return expression 66 | 67 | def generate_dataset(self, num_tasks=100): 68 | dataset = [] 69 | 70 | for _ in tqdm(range(num_tasks), desc="Generating Math Tasks", ncols=60): 71 | expression = self._generate_expression() 72 | 73 | # Calculate solution using eval (safe since we control the input) 74 | try: 75 | # Use a safer expression evaluator 76 | from ast import literal_eval 77 | try: 78 | # First try evaluating as-is 79 | solution = literal_eval(expression) 80 | except (ValueError, SyntaxError): 81 | # If that fails, try evaluating as a math expression 82 | import operator 83 | import math 84 | allowed_operators = { 85 | '+': operator.add, 86 | '-': operator.sub, 87 | '*': operator.mul, 88 | '/': operator.truediv, 89 | '^': operator.pow, 90 | '%': operator.mod, 91 | '√': math.sqrt 92 | } 93 | # Parse and evaluate the expression safely 94 | stack = [] 95 | for token in expression.split(): 96 | if token in allowed_operators: 97 | if token == '√': 98 | operand = stack.pop() 99 | stack.append(allowed_operators[token](operand)) 100 | else: 101 | right = stack.pop() 102 | left = stack.pop() 103 | stack.append(allowed_operators[token](left, right)) 104 | else: 105 | try: 106 | stack.append(float(token)) 107 | except ValueError: 108 | stack.append(0) # Default to 0 for invalid tokens 109 | solution = stack[0] 
if stack else 0 110 | # Round to 2 decimal places for division results 111 | if isinstance(solution, float): 112 | solution = round(solution, 2) 113 | 114 | dataset.append({ 115 | 'task': expression, 116 | 'solution': solution 117 | }) 118 | except ZeroDivisionError: 119 | continue 120 | 121 | 122 | return dataset 123 | 124 | if __name__ == "__main__": 125 | # Generate dataset 126 | generator = MathDatasetGenerator() 127 | dataset = generator.generate_dataset(num_tasks=10000) 128 | 129 | # Save to file 130 | with open("math_dataset.json", "w") as f: 131 | json.dump(dataset, f, indent=2) 132 | 133 | print(f"Generated {len(dataset)} math tasks. Saved to math_dataset.json") 134 | -------------------------------------------------------------------------------- /math_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import tqdm 4 | from collections import Counter 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | class MathEvaluator: 8 | def __init__(self, calculator, num_threads=10, max_samples=100): 9 | self.calculator = calculator 10 | self.num_threads = num_threads 11 | self.max_samples = max_samples 12 | 13 | def evaluate_single_task(self, item): 14 | task = item['task'] 15 | expected_solution = item['solution'] 16 | 17 | iter_start = time.time() 18 | result = self.calculator._forward_with_max_iter(task, max_iter=self.calculator.max_iterations) 19 | elapsed = time.time() - iter_start 20 | 21 | correct = self.calculator._is_correct(result.solution, expected_solution) 22 | return correct, elapsed 23 | 24 | def evaluate_on_dataset(self, dataset_path="math_dataset.json"): 25 | start_time = time.time() 26 | 27 | with open(dataset_path) as f: 28 | dataset = json.load(f) 29 | 30 | dataset = dataset[:self.max_samples] if hasattr(self, 'max_samples') else dataset[:100] 31 | 32 | results = { 33 | "correct": 0, 34 | "time": 0 35 | } 36 | 37 | with ThreadPoolExecutor(max_workers=self.num_threads) as executor: 38 | futures = [ 39 | executor.submit(self.evaluate_single_task, item) 40 | for item in dataset 41 | ] 42 | 43 | for i, future in enumerate(tqdm.tqdm(as_completed(futures), total=len(futures), ncols=60), 1): 44 | correct, elapsed = future.result() 45 | results["correct"] += int(correct) 46 | results["time"] += elapsed 47 | 48 | if i % 100 == 0: 49 | print(f"\nProgress after {i} samples:") 50 | print(f"Correct: {results['correct']}/{i} ({results['correct']/i:.1%})") 51 | print(f"Time: {results['time']:.2f}s") 52 | 53 | total_time = time.time() - start_time 54 | results["accuracy"] = results["correct"] / len(dataset) 55 | results["total_time"] = total_time 56 | results["max_iter"] = self.calculator.max_iterations 57 | 58 | print("\nEvaluation Results:") 59 | print(f"Max Iterations: {self.calculator.max_iterations}") 60 | print(f"Correct Answers: {results['correct']}/{len(dataset)} ({results['accuracy']:.1%})") 61 | print(f"Total Time: {results['total_time']:.2f}s") 62 | 63 | with open("math_calculator_benchmark.json", "w") as f: 64 | json.dump(results, f, indent=2) 65 | 66 | print("\nBenchmark results saved to math_calculator_benchmark.json") 67 | return results 68 | -------------------------------------------------------------------------------- /math_multiplication_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import random 5 | import numpy as np 6 | from typing import List 7 | from 
dspy.teleprompt import MIPROv2
8 | from tqdm import tqdm
9 | 
10 | class MultiplicationSignature(dspy.Signature):
11 |     """Solve multiplication problems step by step."""
12 |     task = dspy.InputField(desc="multiplication task as a string")
13 |     solution = dspy.OutputField(desc="final solution as a number")
14 | 
15 | class MultiplicationSolver(dspy.Module):
16 |     def __init__(self):
17 |         super().__init__()
18 |         self.generate_answer = dspy.ChainOfThought(MultiplicationSignature)
19 | 
20 |     def forward(self, task):
21 |         return self.generate_answer(task=task)
22 | 
23 | def generate_multiplication_dataset(num_samples=1000) -> List[dspy.Example]:
24 |     """Generate multiplication problems with solutions."""
25 |     dataset = []
26 |     for _ in range(num_samples):
27 |         # a = random.randint(1, 10000)
28 |         # b = random.randint(1, 10000)
29 |         max_num = int(1e5)
30 |         a = random.randint(1, max_num)
31 |         b = random.randint(1, max_num)
32 |         task = f"{a} * {b}"
33 |         solution = a * b
34 |         dataset.append(dspy.Example(task=task, solution=solution).with_inputs('task'))
35 |     return dataset
36 | 
37 | def evaluate_multiplication(example, prediction, trace=None):
38 |     """Evaluate if predicted solution matches expected."""
39 |     try:
40 |         pred = float(prediction.solution)
41 |         exp = float(example.solution)
42 |         return int(abs(pred - exp) < 0.01)
43 |     except:
44 |         return 0
45 | 
46 | def optimize_multiplication_solver():
47 |     # Configure language model
48 |     lm = dspy.LM(model="deepseek/deepseek-chat", temperature=0.3, cache=False)
49 |     dspy.settings.configure(lm=lm)
50 | 
51 |     # Generate dataset
52 |     dataset = generate_multiplication_dataset(1000)
53 |     trainset = dataset[:800] # 80% training
54 |     devset = dataset[800:] # 20% validation
55 | 
56 |     # Initialize MIPROv2 optimizer
57 |     teleprompter = MIPROv2(
58 |         metric=evaluate_multiplication,
59 |         num_candidates=3,
60 |         num_threads=10,
61 |         max_bootstrapped_demos=3,
62 |         max_labeled_demos=4,
63 |         # auto='light'
64 |         auto='medium'
65 |     )
66 | 
67 |     # Create and optimize solver
68 |     student = MultiplicationSolver()
69 |     optimized_solver = teleprompter.compile(
70 |         student,
71 |         trainset=trainset,
72 |         valset=devset,
73 |         requires_permission_to_run=False
74 |     )
75 | 
76 |     # Evaluate on validation set
77 |     correct = 0
78 |     for example in devset:
79 |         prediction = optimized_solver(example.task)
80 |         correct += evaluate_multiplication(example, prediction)
81 | 
82 |     accuracy = correct / len(devset)
83 |     print(f"Validation accuracy: {accuracy:.1%}")
84 |     correct = 0  # vs unoptimized solver: reset the counter before re-scoring
85 |     student = MultiplicationSolver()
86 |     for example in devset:
87 |         prediction = student(example.task)
88 |         correct += evaluate_multiplication(example, prediction)
89 | 
90 |     accuracy = correct / len(devset)
91 |     print(f"Unoptimized accuracy: {accuracy:.1%}")
92 | 
93 |     return optimized_solver
94 | 
95 | from dspy.evaluate import Evaluate
96 | 
97 | 
98 | class LLMProgram(dspy.Module):
99 |     def __init__(self):
100 |         super().__init__()
101 |         self.solver = dspy.ChainOfThought('task -> solution')
102 | 
103 |     def forward(self, task):
104 |         return self.solver(task=task)
105 | 
106 | def quick_optimize():
107 |     dspy.settings.configure(lm=dspy.LM(model="deepseek/deepseek-chat"))
108 |     dataset = [dspy.Example(task=f"{a}*{b}", solution=a*b).with_inputs('task')
109 |                for a,b in zip(np.random.randint(1e5,1e6,1000),
110 |                               np.random.randint(1e5,1e6,1000))]
111 |     train, val = dataset[:800], dataset[800:] # 80/20 split
112 |     # metric = lambda e,p,trace=None: int(abs(float(p.solution)-float(e.solution))<0.01)
113 |     metric = lambda
e,p,trace=None: int(abs(float(p.solution.replace(',',''))-float(e.solution))<0.01) 114 | 115 | llm_program = LLMProgram() 116 | compiled_llm_program = MIPROv2(metric=metric, num_threads=100, auto='heavy').compile( 117 | llm_program, trainset=train, valset=val) 118 | 119 | accuracy = sum(metric(e, compiled_llm_program(e.task)) for e in val) / len(val) 120 | print(f"Optimized accuracy: {accuracy:.1%}") 121 | 122 | 123 | 124 | # Evaluate unoptimized solver 125 | student = MultiplicationSolver() 126 | correct = 0 127 | for example in tqdm(val): 128 | try: 129 | prediction = student(example.task) 130 | correct += metric(example, prediction) 131 | except ValueError as e: print(e); pass 132 | 133 | accuracy = correct / len(val) 134 | print(f"Unoptimized accuracy: {accuracy:.1%}") 135 | evaluate = Evaluate(devset=train[:], metric=metric, num_threads=100, display_progress=True, 136 | display_table=True) 137 | evaluate(student, devset=train[:]) 138 | 139 | 140 | if __name__ == "__main__": 141 | # solver = optimize_multiplication_solver() 142 | quick_optimize() 143 | -------------------------------------------------------------------------------- /reasoning_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | 4 | # Step 1: Configure the LM to use DeepSeek with temperature=1 and no caching 5 | lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1, cache=False) # Use DeepSeek as the LM 6 | dspy.settings.configure(lm=lm) 7 | 8 | 9 | action_list = ['reasoning', 'terminate'] 10 | # Step 2: Define the Signature for Core Reasoning 11 | class ReasoningSignature(dspy.Signature): 12 | context = dspy.InputField(desc="The context to reason about") 13 | objective = dspy.InputField(desc="The objective to achieve") 14 | reasoning = dspy.OutputField(desc="The reasoning process including step-by-step calculations") 15 | reasoning_output = dspy.OutputField( 16 | desc="The final output of the reasoning process. If no specific output, repeat the reasoning conclusion.", 17 | optional=True 18 | ) 19 | informal_proof = dspy.OutputField( 20 | desc="A numbered list of steps for the informal proof. If no proof needed, summarize the reasoning steps.", 21 | optional=True 22 | ) 23 | 24 | # Define Signature for Analysis 25 | class RequirementsSignature(dspy.Signature): 26 | context = dspy.InputField(desc="The context of the reasoning") 27 | objective = dspy.InputField(desc="The objective to achieve") 28 | current_requirements = dspy.InputField(desc="List of current requirements to achieve the objective") 29 | new_requirements = dspy.OutputField( 30 | desc="List of new requirements to add to achieve the objective. Return an empty list if no new requirements are needed.", 31 | default=[] 32 | ) 33 | unnecessary_requirements = dspy.OutputField( 34 | desc="List of requirements that are no longer needed to achieve the objective. 
Return an empty list if no requirements should be removed.", 35 | default=[] 36 | ) 37 | action = dspy.OutputField( 38 | desc="The action to take: 'add_requirements' if new requirements are needed, 'remove_requirements' if requirements should be removed, or 'stop' if requirements are complete", 39 | default="stop" 40 | ) 41 | 42 | class ReasoningAnalysisSignature(dspy.Signature): 43 | context = dspy.InputField(desc="The context of the reasoning") 44 | reasoning = dspy.InputField(desc="The reasoning process to analyze") 45 | reasoning_output = dspy.InputField(desc="The output of the reasoning process") 46 | informal_proof = dspy.InputField(desc="The numbered list of proof steps to analyze") 47 | 48 | proof_line_analysis = dspy.OutputField( 49 | desc="Detailed analysis of each proof line, checking if it makes logical sense and is mathematically correct" 50 | ) 51 | 52 | objective_achieved_analysis = dspy.OutputField( 53 | desc="Analysis of whether the objective was fully achieved" 54 | ) 55 | 56 | objective_achieved_confidence = dspy.OutputField( 57 | desc="Confidence score from 1-10 where 1 means extremely sure objective was not achieved and 10 means objective was definitely achieved" 58 | ) 59 | 60 | is_valid_reasoning = dspy.OutputField( 61 | desc="True if the reasoning in the input is valid and reaches the correct conclusion" 62 | ) 63 | 64 | action = dspy.OutputField( 65 | desc="The action to take, must be either 'reasoning' or 'terminate'" 66 | ) 67 | 68 | 69 | # Step 3: Create a Module with the Signature 70 | class RequirementsGenerator(dspy.Module): 71 | def __init__(self): 72 | super().__init__() 73 | self.generate_requirements = dspy.ChainOfThought(RequirementsSignature) 74 | 75 | def forward(self, context, objective, current_requirements): 76 | result = self.generate_requirements( 77 | context=context, 78 | objective=objective, 79 | current_requirements=current_requirements 80 | ) 81 | return result 82 | 83 | class ActionReasoning(dspy.Module): 84 | def __init__(self): 85 | super().__init__() 86 | # Use ChainOfThought for core reasoning 87 | self.generate_action = dspy.ChainOfThought(ReasoningSignature) 88 | # Separate module for analysis 89 | self.analyze_reasoning = dspy.ChainOfThought(ReasoningAnalysisSignature) 90 | # Module for requirements generation 91 | self.requirements_generator = RequirementsGenerator() 92 | 93 | def forward(self, context, objective): 94 | # First generate the reasoning 95 | reasoning_result = self.generate_action(context=context, objective=objective) 96 | 97 | # Handle missing fields 98 | reasoning = getattr(reasoning_result, "reasoning", "No reasoning provided") 99 | reasoning_output = getattr(reasoning_result, "reasoning_output", reasoning) 100 | informal_proof = getattr(reasoning_result, "informal_proof", reasoning) 101 | 102 | # Then analyze the reasoning and proof 103 | analysis_result = self.analyze_reasoning( 104 | context=context, 105 | reasoning=reasoning, 106 | reasoning_output=reasoning_output, 107 | informal_proof=informal_proof 108 | ) 109 | 110 | # Handle missing analysis fields 111 | objective_achieved_analysis = getattr(analysis_result, "objective_achieved_analysis", "No analysis provided") 112 | objective_achieved_confidence = getattr(analysis_result, "objective_achieved_confidence", 5) 113 | is_valid_reasoning = getattr(analysis_result, "is_valid_reasoning", "unknown") 114 | action = getattr(analysis_result, "action", "reasoning") 115 | proof_line_analysis = getattr(analysis_result, "proof_line_analysis", "No proof line analysis 
provided") 116 | 117 | combined = { 118 | "reasoning": reasoning, 119 | "reasoning_output": reasoning_output, 120 | "informal_proof": informal_proof, 121 | "objective_achieved_analysis": objective_achieved_analysis, 122 | "objective_achieved_confidence": objective_achieved_confidence, 123 | "is_valid_reasoning": is_valid_reasoning, 124 | "action": action, 125 | "proof_line_analysis": proof_line_analysis 126 | } 127 | return dspy.Prediction(**combined) 128 | 129 | # Step 4: Create an Instance of the Pipeline 130 | reasoning_pipeline = ActionReasoning() 131 | 132 | def generate_requirements(context, objective): 133 | """Iteratively generate and refine requirements for achieving an objective""" 134 | requirements = [] 135 | iteration = 1 136 | max_iterations = 10 137 | 138 | while iteration <= max_iterations: 139 | # If we hit max iterations, reset completely and try again 140 | if iteration == max_iterations: 141 | print("\nWarning: Reached max iterations. Resetting requirements and starting fresh.") 142 | requirements = [] 143 | iteration = 1 144 | continue 145 | 146 | print(f"\n--- Requirements Iteration {iteration} ---") 147 | print("Current Requirements:") 148 | for i, req in enumerate(requirements, 1): 149 | print(f"{i}. {req}") 150 | 151 | # Generate new requirements 152 | result = RequirementsGenerator()( 153 | context=context, 154 | objective=objective, 155 | current_requirements=requirements 156 | ) 157 | 158 | # Process new requirements 159 | if result.new_requirements: 160 | if isinstance(result.new_requirements, str): 161 | # Split string into list items and filter out non-requirement statements 162 | new_reqs = [ 163 | req.strip() for req in result.new_requirements.split('\n') 164 | if req.strip() and not req.lower().startswith(('none', 'no new')) 165 | ] 166 | else: 167 | # Filter list items for non-requirement statements 168 | new_reqs = [ 169 | req for req in result.new_requirements 170 | if not str(req).lower().startswith(('none', 'no new')) 171 | ] 172 | 173 | if new_reqs: # Only add if we have actual requirements 174 | print("\nAdding new requirements:") 175 | for req in new_reqs: 176 | print(f"- {req}") 177 | requirements.append(req) 178 | 179 | # Process unnecessary requirements 180 | if result.unnecessary_requirements: 181 | if isinstance(result.unnecessary_requirements, str): 182 | # Split string into list items 183 | remove_reqs = [req.strip() for req in result.unnecessary_requirements.split('\n') if req.strip()] 184 | else: 185 | remove_reqs = result.unnecessary_requirements 186 | 187 | print("\nRemoving unnecessary requirements:") 188 | for req in remove_reqs: 189 | print(f"- {req}") 190 | requirements = [r for r in requirements if r not in remove_reqs] 191 | 192 | # Check if we should stop 193 | if result.action.lower().strip() == "stop": 194 | print("\nRequirements generation complete") 195 | break 196 | 197 | iteration += 1 198 | 199 | print("\nFinal Requirements:") 200 | for i, req in enumerate(requirements, 1): 201 | print(f"{i}. 
{req}") 202 | 203 | return requirements 204 | 205 | def track_analysis(analysis_history, analysis, confidence): 206 | """Track analysis results as a list of tuples""" 207 | try: 208 | # Extract first digit if confidence is a string 209 | if isinstance(confidence, str): 210 | confidence = ''.join(filter(str.isdigit, confidence)) or '5' 211 | confidence_int = int(confidence) 212 | # Clamp to 1-10 range 213 | confidence_int = max(1, min(10, confidence_int)) 214 | analysis_history.append((analysis, confidence_int)) 215 | except (ValueError, TypeError): 216 | # Default to medium confidence if parsing fails 217 | analysis_history.append((analysis, 5)) 218 | return analysis_history 219 | 220 | def run_reasoning_pipeline(initial_context, initial_objective, callback=None): 221 | # Generate requirements first with retry logic 222 | max_retries = 3 223 | requirements = [] 224 | 225 | for attempt in range(max_retries): 226 | requirements = generate_requirements(initial_context, initial_objective) 227 | print(f"\nFinal Requirements: {requirements}") 228 | 229 | # If we got requirements, break 230 | if requirements: 231 | break 232 | 233 | print(f"\nWarning: Empty requirements list on attempt {attempt + 1}. Retrying...") 234 | 235 | # If still empty after retries, use a default requirement 236 | if not requirements: 237 | print("\nWarning: Could not generate requirements after multiple attempts. Using default.") 238 | requirements = ["Use the given numbers and operations to achieve the objective"] 239 | 240 | # Initialize context and analysis history 241 | requirements_str = "\n".join(f"- {req}" for req in requirements) 242 | initial_context_with_reqs = f"{initial_context.strip()}\n\nRequirements:\n{requirements_str}" 243 | context_history = [initial_context_with_reqs] 244 | analysis_history = [] 245 | 246 | # Extract question and hint if they exist 247 | context_lines = initial_context.split('\n') 248 | question = next((line for line in context_lines if line.startswith("Final Question:")), initial_context) 249 | hint = next((line for line in context_lines if line.startswith("Hint:")), "") 250 | 251 | # Create display context for debugging 252 | display_context = f"{question}\n{hint}\n\nRequirements:\n{requirements_str}" if hint else f"{question}\n\nRequirements:\n{requirements_str}" 253 | objective = initial_objective 254 | iteration = 1 255 | 256 | while True: 257 | print(f"\n--- Reasoning Iteration {iteration} ---") 258 | print(f"Context: {display_context}") 259 | print(f"Objective: {objective}") 260 | 261 | # Get current context from history 262 | current_context = "\n\n".join(context_history) 263 | 264 | # Run the reasoning pipeline 265 | result = reasoning_pipeline(context=current_context, objective=objective) 266 | 267 | # Track analysis and call callback if provided 268 | analysis_history = track_analysis(analysis_history, 269 | result.objective_achieved_analysis, 270 | result.objective_achieved_confidence) 271 | 272 | if callback: 273 | callback(iteration, current_context, objective, result) 274 | 275 | # Validate and process the action 276 | action = result.action.lower().strip() 277 | print("Reasoning Process:", result.reasoning) 278 | print("Reasoning Output:", result.reasoning_output) 279 | print("\nDetailed Informal Proof Steps:") 280 | if isinstance(result.informal_proof, str): 281 | # Convert string proof to list if needed 282 | proof_steps = [step.strip() for step in result.informal_proof.split('\n') if step.strip()] 283 | else: 284 | proof_steps = result.informal_proof 285 | 286 | for 
i, step in enumerate(proof_steps, 1):
287 |             print(f"{i}. {step}")
288 | 
289 |         print("\nProof Line Analysis:")
290 |         print(result.proof_line_analysis)
291 |         print("\nObjective Achievement Analysis:")
292 |         print(f"{result.objective_achieved_analysis} (Confidence: {result.objective_achieved_confidence}/10)")
293 |         print("\nAnalysis History:")
294 |         for i, (analysis, confidence) in enumerate(analysis_history, 1):
295 |             print(f"Iteration {i}: {analysis} (Confidence: {confidence}/10)")
296 | 
297 |         print("action:", action)
298 | 
299 |         # Only accept termination if explicitly told to
300 |         if "terminate" in action or "no further" in action:
301 |             if result.is_valid_reasoning.lower().strip() in ["true", "yes", "correct"]:
302 |                 print("Decision: Terminate reasoning process with valid solution")
303 |                 break
304 |             else:
305 |                 print("Decision: Invalid solution found - continuing reasoning")
306 |                 objective = "The previous solution was mathematically incorrect. Try a different approach."
307 |                 continue
308 | 
309 |         print("Decision: Continue reasoning")
310 | 
311 |         # Update context history with full reasoning details
312 |         context_history.append(f"""
313 | --- Reasoning Iteration {iteration} ---
314 | Context: {display_context}
315 | Objective: {objective}
316 | Reasoning Process: {result.reasoning}
317 | Reasoning Output: {result.reasoning_output}
318 | Objective Analysis: {result.objective_achieved_analysis} (Confidence: {result.objective_achieved_confidence}/10)
319 | """.strip())
320 | 
321 |         # Update context for next iteration with full history
322 |         context = "\n\n".join(context_history)
323 |         objective = "Continue reasoning based on previous analysis"
324 |         iteration += 1
325 | 
326 | # Example usage
327 | initial_context = """
328 | How can you solve the Game of 24 using the numbers 3,
329 | 4, 5, and 6?
330 | Let's think step by step:
331 | 1. We need to use basic arithmetic operations (+, -, *, /) to
332 | get 24.
333 | 2. One possible solution is: (3 + 5 - 4) * 6 = 24."""
334 | initial_objective = "Generate a new solution using the same numbers."
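# A minimal sketch of wiring in the optional callback hook of
# run_reasoning_pipeline; the callback name and body below are illustrative
# and not part of the module:
#
#     def log_step(iteration, context, objective, result):
#         print(f"[iteration {iteration}] action={result.action} "
#               f"(confidence {result.objective_achieved_confidence}/10)")
#
#     run_reasoning_pipeline(initial_context, initial_objective, callback=log_step)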
335 | 336 | if __name__ == "__main__": 337 | run_reasoning_pipeline(initial_context, initial_objective) 338 | -------------------------------------------------------------------------------- /researcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | from typing import List, Dict, Optional 4 | from serper_search import SerperSearch 5 | 6 | class DecideNextActionSignature(dspy.Signature): 7 | """Decide the next action to take based on current information""" 8 | search_results = dspy.InputField(desc="All search results from previous searches") 9 | current_text = dspy.InputField(desc="The current text being worked on") 10 | downloaded_sites = dspy.InputField(desc="All previously downloaded website contents") 11 | reasoning = dspy.OutputField(desc="Reasoning for the chosen action") 12 | action = dspy.OutputField(desc="Next action to take: 'search', 'rewrite', or 'download'") 13 | action_reasoning = dspy.OutputField(desc="Reasoning for the chosen action") 14 | 15 | class RewriteTextSignature(dspy.Signature): 16 | """Rewrite the current text using all available information""" 17 | all_texts = dspy.InputField(desc="All texts including search results and downloaded content") 18 | current_text = dspy.InputField(desc="The current text being rewritten") 19 | reasoning = dspy.OutputField(desc="Reasoning for the rewrite") 20 | rewritten_text = dspy.OutputField(desc="The new rewritten text") 21 | rewrite_reasoning = dspy.OutputField(desc="Explanation of changes made") 22 | 23 | class EvaluateTextSignature(dspy.Signature): 24 | """Evaluate the quality of the rewritten text""" 25 | original_text = dspy.InputField(desc="The original text before rewriting") 26 | rewritten_text = dspy.InputField(desc="The rewritten text to evaluate") 27 | evaluation_reasoning = dspy.OutputField(desc="Detailed reasoning for the evaluation score") 28 | evaluation = dspy.OutputField(desc="Evaluation of text quality on a scale from 1-10") 29 | improvement_suggestions = dspy.OutputField(desc="Suggestions for further improving the text") 30 | 31 | class GenerateSearchQuerySignature(dspy.Signature): 32 | """Generate an effective search query based on research needs""" 33 | current_text = dspy.InputField(desc="The current text being researched") 34 | research_goal = dspy.InputField(desc="The overall goal of the research") 35 | search_results = dspy.InputField(desc="Previous search results", default="") 36 | reasoning = dspy.OutputField(desc="Reasoning for the search query") 37 | search_query = dspy.OutputField(desc="The search query to use") 38 | query_type = dspy.OutputField( 39 | desc="Type of query: 'general' for broad searches, 'specific' for focused searches", 40 | default="general" 41 | ) 42 | 43 | class Researcher(dspy.Module): 44 | def __init__(self, max_iterations: int = 10, max_searches: int = 3): 45 | super().__init__() 46 | self.forward = self.run_research # Map forward to run_research 47 | 48 | # Configure DeepSeek as the language model with higher temperature for more creativity 49 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.5, cache=False) 50 | dspy.settings.configure(lm=self.lm) 51 | 52 | # Initialize search client 53 | self.search_client = SerperSearch() 54 | 55 | self.max_iterations = max_iterations 56 | self.max_searches = max_searches 57 | self.search_count = 0 58 | self.research_goal = "" 59 | 60 | # Initialize the DSPy modules 61 | self.decide_action = dspy.ChainOfThought(DecideNextActionSignature) 62 | 
self.rewrite_text = dspy.ChainOfThought(RewriteTextSignature) 63 | self.evaluate_text = dspy.ChainOfThought(EvaluateTextSignature) 64 | self.generate_search_query = dspy.ChainOfThought(GenerateSearchQuerySignature) 65 | 66 | # State tracking 67 | self.search_results = [] 68 | self.downloaded_sites = [] 69 | self.all_texts = [] 70 | self.current_text = "" 71 | self.evaluation_history = [] 72 | 73 | def add_search_results(self, results: List[Dict]): 74 | """Add new search results to the researcher's knowledge""" 75 | self.search_results.extend(results) 76 | self.all_texts.extend([r['snippet'] for r in results]) 77 | 78 | def add_downloaded_site(self, content: str): 79 | """Add downloaded website content to the researcher's knowledge""" 80 | self.downloaded_sites.append(content) 81 | self.all_texts.append(content) 82 | 83 | def decide_next_action(self) -> str: 84 | """Determine the next action to take""" 85 | if self.search_count >= self.max_searches: 86 | return 'rewrite' 87 | 88 | result = self.decide_action( 89 | search_results=self.search_results, 90 | current_text=self.current_text, 91 | downloaded_sites=self.downloaded_sites 92 | ) 93 | return result.action.lower() 94 | 95 | def rewrite_current_text(self) -> str: 96 | """Rewrite the current text using all available information""" 97 | result = self.rewrite_text( 98 | all_texts=self.all_texts, 99 | current_text=self.current_text 100 | ) 101 | return result.rewritten_text 102 | 103 | def evaluate_current_text(self) -> Dict: 104 | """Evaluate the quality of the current text""" 105 | if not self.current_text: 106 | return { 107 | 'evaluation': 0, 108 | 'evaluation_reasoning': 'No text to evaluate', 109 | 'improvement_suggestions': 'Start with initial text' 110 | } 111 | 112 | result = self.evaluate_text( 113 | original_text=self.all_texts[0] if self.all_texts else "", 114 | rewritten_text=self.current_text 115 | ) 116 | try: 117 | # Handle different evaluation score formats 118 | if isinstance(result.evaluation, str): 119 | # Try to extract number from string 120 | import re 121 | numbers = re.findall(r'\d+', result.evaluation) 122 | if numbers: 123 | evaluation_score = float(numbers[0]) 124 | else: 125 | evaluation_score = 1.0 126 | else: 127 | evaluation_score = float(result.evaluation) 128 | 129 | # Clamp score between 1-10 and round to nearest integer 130 | evaluation_score = max(1.0, min(10.0, evaluation_score)) 131 | evaluation_score = round(evaluation_score) 132 | 133 | return { 134 | 'evaluation': evaluation_score, 135 | 'evaluation_reasoning': result.evaluation_reasoning, 136 | 'improvement_suggestions': result.improvement_suggestions 137 | } 138 | except (ValueError, TypeError): 139 | # Default to low score if conversion fails 140 | return { 141 | 'evaluation': 1, 142 | 'evaluation_reasoning': "Invalid evaluation score format", 143 | 'improvement_suggestions': "Ensure evaluation returns a valid number between 1-10" 144 | } 145 | 146 | def generate_search_terms(self) -> str: 147 | """Generate effective search terms based on current research state""" 148 | result = self.generate_search_query( 149 | current_text=self.current_text, 150 | research_goal=self.research_goal, 151 | search_results=self.search_results 152 | ) 153 | return result.search_query 154 | 155 | def run_research(self, initial_text: str) -> Dict: 156 | """Run the research process with iteration control""" 157 | if not initial_text: 158 | raise ValueError("Initial text cannot be empty") 159 | 160 | self.current_text = initial_text 161 | self.all_texts = [initial_text] 
162 | self.research_goal = initial_text # Use initial text as research goal 163 | 164 | for iteration in range(self.max_iterations): 165 | print(f"\n--- Research Iteration {iteration + 1} ---") 166 | 167 | # Decide next action 168 | action = self.decide_next_action() 169 | print(f"Action: {action}") 170 | 171 | if action == 'search': 172 | if self.search_count >= self.max_searches: 173 | print("Max searches reached, switching to rewrite") 174 | action = 'rewrite' 175 | else: 176 | self.search_count += 1 177 | # Generate optimized search terms 178 | search_term = self.generate_search_terms() 179 | print(f"Performing search for: {search_term}...") 180 | 181 | try: 182 | # Perform actual search with error handling 183 | search_results = self.search_client.search(search_term) 184 | if search_results: 185 | self.add_search_results(search_results) 186 | else: 187 | print("Warning: No search results found") 188 | except Exception as e: 189 | print(f"Search error: {str(e)}") 190 | continue 191 | 192 | continue 193 | 194 | elif action == 'download': 195 | # Note: Actual download implementation would go here 196 | print("Downloading site...") 197 | continue 198 | 199 | elif action == 'rewrite': 200 | # Rewrite the text 201 | new_text = self.rewrite_current_text() 202 | print("\nRewritten Text:") 203 | print(new_text) 204 | 205 | # Evaluate the new text 206 | evaluation = self.evaluate_current_text() 207 | print("\nEvaluation:") 208 | print(f"Score: {evaluation['evaluation']}/10") 209 | print(f"Reasoning: {evaluation['evaluation_reasoning']}") 210 | print(f"Suggestions: {evaluation['improvement_suggestions']}") 211 | 212 | # Update state 213 | self.current_text = new_text 214 | self.all_texts.append(new_text) 215 | self.evaluation_history.append(evaluation) 216 | 217 | # Check if we should terminate 218 | if evaluation['evaluation'] >= 9: 219 | print("\nHigh quality text achieved, stopping research") 220 | break 221 | 222 | else: 223 | print(f"Unknown action: {action}, defaulting to rewrite") 224 | action = 'rewrite' 225 | 226 | return { 227 | 'final_text': self.current_text, 228 | 'evaluation_history': self.evaluation_history, 229 | 'search_count': self.search_count, 230 | 'iterations': iteration + 1 231 | } 232 | 233 | if __name__ == "__main__": 234 | # Example usage 235 | initial_text = "Write a comprehensive overview of recent developments in AI research" 236 | 237 | researcher = Researcher(max_iterations=10, max_searches=3) 238 | result = researcher.run_research(initial_text) 239 | 240 | print("\nFinal Result:") 241 | print(result['final_text']) 242 | print("\nEvaluation History:") 243 | for i, eval in enumerate(result['evaluation_history'], 1): 244 | print(f"Iteration {i}: Score {eval['evaluation']}/10") 245 | -------------------------------------------------------------------------------- /researcher_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import dspy 3 | from typing import List, Dict 4 | from researcher import Researcher 5 | from dspy.teleprompt import MIPROv2 6 | from dspy import Example 7 | 8 | # Dataset of task prompts for optimization 9 | RESEARCH_TASKS = [ 10 | { 11 | "input": "Write a comprehensive overview of recent developments in AI research", 12 | }, 13 | { 14 | "input": "Explain the latest breakthroughs in quantum computing", 15 | }, 16 | { 17 | "input": "Compare different approaches to climate change mitigation", 18 | }, 19 | { 20 | "input": "Analyze the impact of social media on mental health", 
21 | }, 22 | { 23 | "input": "Describe the evolution of renewable energy technologies", 24 | }, 25 | { 26 | "input": "Evaluate the effectiveness of different education systems worldwide", 27 | }, 28 | { 29 | "input": "Explain the causes and effects of inflation in modern economies", 30 | }, 31 | { 32 | "input": "Discuss the future of space exploration", 33 | }, 34 | { 35 | "input": "Analyze the role of AI in healthcare diagnostics", 36 | }, 37 | { 38 | "input": "Compare traditional and modern architectural styles", 39 | } 40 | ] 41 | 42 | 43 | def create_dataset() -> List[Example]: 44 | """Create dataset from predefined research tasks with validation""" 45 | dataset = [] 46 | for task in RESEARCH_TASKS: 47 | example = Example(input=task["input"]).with_inputs('input') 48 | dataset.append(example) 49 | return dataset 50 | 51 | class ResearcherOptimizer: 52 | def __init__(self, max_iterations: int = 10, max_searches: int = 3): 53 | self.max_iterations = max_iterations 54 | self.max_searches = max_searches 55 | self.dataset = create_dataset() 56 | 57 | # Configure DeepSeek as the language model 58 | self.lm = dspy.LM(model="deepseek/deepseek-chat", temperature=1.0, cache=False) 59 | dspy.settings.configure(lm=self.lm) 60 | 61 | def evaluate_researcher(self, researcher: Researcher, example: Example) -> float: 62 | """Evaluate researcher performance on a single example""" 63 | result = researcher(example.input) 64 | final_text = result['final_text'] 65 | 66 | # Simple evaluation metric (could be enhanced) 67 | score = self._calculate_similarity(final_text, example.output) 68 | return score 69 | 70 | def _calculate_similarity(self, text1: str, text2: str) -> float: 71 | """Improved text similarity metric using TF-IDF cosine similarity""" 72 | from sklearn.feature_extraction.text import TfidfVectorizer 73 | from sklearn.metrics.pairwise import cosine_similarity 74 | 75 | # Handle empty text cases 76 | if not text1 or not text2: 77 | return 0.0 78 | 79 | # Create TF-IDF vectors 80 | vectorizer = TfidfVectorizer() 81 | tfidf_matrix = vectorizer.fit_transform([text1, text2]) 82 | 83 | # Calculate cosine similarity 84 | similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 85 | return float(similarity) 86 | 87 | def optimize(self, num_candidates: int = 5, num_iterations: int = 3) -> Researcher: 88 | """Optimize the researcher using MIPRO""" 89 | # Define the teleprompter with MIPROv2 90 | teleprompter = MIPROv2( 91 | metric=self.evaluate_researcher, 92 | num_candidates=num_candidates, 93 | num_threads=1, # MIPROv2 uses internal parallelization 94 | teacher_settings=dict(lm=self.lm), 95 | init_temperature=1.0, 96 | prompt_model=self.lm, 97 | task_model=self.lm, 98 | auto='medium', 99 | track_stats=True 100 | ) 101 | 102 | # Create initial researcher 103 | base_researcher = Researcher( 104 | max_iterations=self.max_iterations, 105 | max_searches=self.max_searches 106 | ) 107 | 108 | # Run optimization 109 | optimized_researcher = teleprompter.compile( 110 | base_researcher, 111 | trainset=self.dataset, 112 | num_trials=num_iterations, 113 | requires_permission_to_run=False # Disable confirmation prompt 114 | ) 115 | 116 | return optimized_researcher 117 | 118 | if __name__ == "__main__": 119 | optimizer = ResearcherOptimizer() 120 | 121 | print("Starting researcher optimization...") 122 | optimized_researcher = optimizer.optimize() 123 | 124 | print("\nOptimization complete. 
Testing optimized researcher:") 125 | test_task = RESEARCH_TASKS[0] 126 | result = optimized_researcher.run_research(test_task["input"]) 127 | 128 | print("\nTest Task Input:", test_task["input"]) 129 | print("\nGenerated Output:", result['final_text']) 130 | print("\nEvaluation History:") 131 | for i, eval in enumerate(result['evaluation_history'], 1): 132 | print(f"Iteration {i}: Score {eval['evaluation']}/10") 133 | -------------------------------------------------------------------------------- /residual_pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | import time 6 | from typing import List, Optional 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | from tqdm import tqdm 9 | 10 | class SearchReplaceModule(dspy.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.process = dspy.ChainOfThought('input -> search_block, replace_block') 14 | 15 | def forward(self, input_text: str) -> str: 16 | result = self.process(input=input_text) 17 | if not hasattr(result, 'search_block') or not hasattr(result, 'replace_block'): 18 | return input_text 19 | return input_text.replace(result.search_block, result.replace_block) 20 | 21 | class SearchReplacePipeline(dspy.Module): 22 | def __init__(self, num_layers: int = 3): 23 | super().__init__() 24 | self.layers = [SearchReplaceModule() for _ in range(num_layers)] 25 | 26 | def forward(self, task: str) -> str: 27 | current = task 28 | for layer in self.layers: 29 | current = layer(current) 30 | return current 31 | 32 | 33 | 34 | class SearchReplaceIterModule(dspy.Module): 35 | def __init__(self): 36 | super().__init__() 37 | self.process = dspy.ChainOfThought('input, iteration -> search_block, replace_block') 38 | 39 | def forward(self, input_text: str, iteration: int) -> str: 40 | result = self.process(input=input_text, iteration=iteration) 41 | if not hasattr(result, 'search_block') or not hasattr(result, 'replace_block'): 42 | return input_text 43 | return input_text.replace(result.search_block, result.replace_block) 44 | 45 | class SearchReplaceIterPipeline(dspy.Module): 46 | def __init__(self, num_iters: int = 10): 47 | super().__init__() 48 | self.layers = [SearchReplaceIterModule() for _ in range(num_iters)] 49 | 50 | def forward(self, task: str) -> str: 51 | current = task 52 | # for layer in self.layers: 53 | for iteration, layer in enumerate(self.layers): 54 | # current = layer(current) 55 | current = layer(current, iteration) 56 | return current 57 | 58 | 59 | 60 | 61 | 62 | 63 | # try: 64 | # # Try to evaluate the final expression 65 | # return str(eval(current)) 66 | # except: 67 | # return current 68 | 69 | def evaluate_pipeline( 70 | dataset_path: str = "math_dataset.json", 71 | num_threads: int = 10, 72 | num_layers: int = 10, 73 | model: str = "deepseek/deepseek-chat", 74 | temperature: float = 0.3 75 | ) -> float: 76 | print(f"\nEvaluating SearchReplace Pipeline with {num_layers} layers using {model}...") 77 | start_time = time.time() 78 | 79 | with open(dataset_path) as f: 80 | dataset = json.load(f) 81 | 82 | lm = dspy.LM(model=model, temperature=temperature, cache=False) 83 | dspy.settings.configure(lm=lm) 84 | pipeline = SearchReplacePipeline(num_layers=num_layers) 85 | 86 | correct = 0 87 | total_tasks = min(len(dataset), 100) 88 | results = [] 89 | 90 | def evaluate_task(task_data): 91 | try: 92 | task = task_data['task'] 93 | expected = float(task_data['solution']) 94 | 95 | predicted 
= pipeline(task) 96 | predicted_num = float(predicted) 97 | 98 | is_correct = abs(predicted_num - expected) < 0.01 99 | return { 100 | 'task': task, 101 | 'predicted': predicted, 102 | 'expected': expected, 103 | 'correct': is_correct 104 | } 105 | except (ValueError, TypeError) as e: 106 | return { 107 | 'task': task_data['task'], 108 | 'error': str(e), 109 | 'correct': False 110 | } 111 | 112 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 113 | futures = [ 114 | executor.submit(evaluate_task, task_data) 115 | for task_data in dataset[:total_tasks] 116 | ] 117 | 118 | with tqdm(total=total_tasks, desc="Evaluating") as pbar: 119 | for future in as_completed(futures): 120 | result = future.result() 121 | results.append(result) 122 | if result.get('correct', False): 123 | correct += 1 124 | pbar.update(1) 125 | 126 | # Display running accuracy 127 | current_accuracy = correct / len(results) 128 | pbar.set_postfix({'accuracy': f'{current_accuracy:.1%}'}) 129 | 130 | accuracy = correct / total_tasks 131 | elapsed = time.time() - start_time 132 | 133 | print("\nEvaluation Results:") 134 | print(f"Accuracy: {accuracy:.1%}") 135 | print(f"Time taken: {elapsed:.1f}s") 136 | print(f"Tasks evaluated: {total_tasks}") 137 | 138 | # Display some example predictions 139 | print("\nExample predictions:") 140 | for i, result in enumerate(results[:5]): 141 | print(f"\nTask {i+1}:") 142 | print(f"Input: {result['task']}") 143 | if 'error' in result: 144 | print(f"Error: {result['error']}") 145 | else: 146 | print(f"Predicted: {result['predicted']}") 147 | print(f"Expected: {result['expected']}") 148 | print(f"Correct: {result['correct']}") 149 | 150 | return accuracy 151 | 152 | if __name__ == "__main__": 153 | evaluate_pipeline(num_layers=3) 154 | -------------------------------------------------------------------------------- /residual_pipeline_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import dspy 4 | import json 5 | import time 6 | import numpy as np 7 | from typing import Dict, List, Tuple 8 | from concurrent.futures import ThreadPoolExecutor 9 | from tqdm import tqdm 10 | from residual_pipeline import SearchReplacePipeline, evaluate_pipeline 11 | from residual_pipeline import SearchReplaceIterPipeline 12 | 13 | PIPELINE_TYPE_STANDARD = "standard" 14 | PIPELINE_TYPE_ITER = "iter" 15 | 16 | class PipelineOptimizer: 17 | def __init__(self, pipeline_type: str = PIPELINE_TYPE_STANDARD): 18 | self.best_config = None 19 | self.best_accuracy = 0.0 20 | self.results_history = [] 21 | self.pipeline_type = pipeline_type 22 | self.dataset_path = "math_dataset.json" 23 | 24 | def _create_teleprompter(self, metric, optimizer_type: str = "bfs"): 25 | """Create and configure teleprompter""" 26 | config = self._get_default_config() 27 | if optimizer_type == "mipro": 28 | return dspy.teleprompt.MIPROv2( 29 | metric=metric, 30 | num_candidates=config['num_candidates'], 31 | num_threads=config['num_threads'], 32 | max_bootstrapped_demos=config['max_bootstrapped_demos'], 33 | max_labeled_demos=config['max_labeled_demos'], 34 | auto='light' 35 | ) 36 | else: # Default to BootstrapFewShot 37 | return dspy.teleprompt.BootstrapFewShot( 38 | metric=metric, 39 | max_bootstrapped_demos=config['max_bootstrapped_demos'], 40 | max_labeled_demos=config['max_labeled_demos'] 41 | ) 42 | 43 | def bootstrap_dataset(self, dataset: List[Dict], num_bootstrap: int = 5) -> List[Dict]: 44 | indices = np.random.choice(len(dataset), 
size=num_bootstrap, replace=True) 45 | return [dataset[i] for i in indices] 46 | 47 | def _create_pipeline(self, config): 48 | """Create appropriate pipeline based on configured type""" 49 | if self.pipeline_type == PIPELINE_TYPE_ITER: 50 | return SearchReplaceIterPipeline(num_iters=config['num_layers']) 51 | return SearchReplacePipeline(num_layers=config['num_layers']) 52 | 53 | def _evaluate_pipeline(self, config, dataset_path, num_threads): 54 | """Evaluate pipeline with given configuration""" 55 | return evaluate_pipeline( 56 | dataset_path=dataset_path, 57 | num_layers=config['num_layers'], 58 | num_threads=num_threads, 59 | model=config['model'], 60 | temperature=config['temperature'] 61 | ) 62 | 63 | def _get_default_config(self) -> Dict: 64 | """Get default configuration for optimization""" 65 | return { 66 | 'num_layers': 10, 67 | 'temperature': 1.0, 68 | 'model': "deepseek/deepseek-chat", 69 | 'num_threads': 10, 70 | 'num_candidates': 3, 71 | 'max_bootstrapped_demos': 3, 72 | 'max_labeled_demos': 4 73 | } 74 | 75 | def _load_dataset(self, dataset_path: str) -> List[Dict]: 76 | """Load dataset from JSON file""" 77 | with open(dataset_path) as f: 78 | return json.load(f) 79 | 80 | def _create_trainset(self, dataset: List[Dict]) -> List[dspy.Example]: 81 | """Create training set from dataset""" 82 | trainset = [] 83 | # for item in dataset[:100]: # Use first 100 examples for training 84 | # random sample import 85 | # for item_i in range(100): 86 | from random import sample 87 | sample_dataset = sample(dataset, 100) 88 | for item in sample_dataset: 89 | trainset.append(dspy.Example( 90 | task=item['task'], 91 | solution=item['solution'] 92 | ).with_inputs('task')) 93 | return trainset 94 | 95 | def _configure_model(self, config: Dict) -> None: 96 | """Configure DSPy language model""" 97 | lm = dspy.LM( 98 | model=config['model'], 99 | temperature=config['temperature'], 100 | cache=False 101 | ) 102 | dspy.settings.configure(lm=lm) 103 | 104 | def _create_fewshot_examples(self, trainset: List[dspy.Example]) -> List[dspy.Example]: 105 | """Create few-shot examples from training set""" 106 | fewshot_examples = [] 107 | for example in trainset[:5]: # Use first 5 examples for few-shot 108 | fewshot_examples.append(dspy.Example( 109 | task=example.task, 110 | solution=example.solution 111 | ).with_inputs('task')) 112 | return fewshot_examples 113 | 114 | def optimize(self) -> Dict: 115 | 116 | print("\nStarting Pipeline Optimization...") 117 | start_time = time.time() 118 | 119 | config = self._get_default_config() 120 | 121 | full_dataset = self._load_dataset(self.dataset_path) 122 | 123 | # Use BFS as default optimizer 124 | optimizer_type = "bfs" 125 | print(f"\nUsing {optimizer_type.upper()} optimizer...") 126 | self._configure_model(config) 127 | 128 | # Define metric function 129 | def metric(example, prediction, trace=None): 130 | try: 131 | pred = float(prediction.solution) 132 | exp = float(example.solution) 133 | return int(abs(pred - exp) < 0.01) 134 | except: 135 | return 0 136 | 137 | teacher = None 138 | best_accuracy = 0.0 139 | best_pipeline = None 140 | 141 | num_iterations = 3 142 | for iteration in range(num_iterations): 143 | print(f"\nBFS Iteration {iteration + 1}/{num_iterations}") 144 | 145 | teleprompter = self._create_teleprompter(metric, optimizer_type) 146 | 147 | # Create new student pipeline 148 | student = self._create_pipeline(config) 149 | 150 | trainset = self._create_trainset(full_dataset) 151 | # Compile with current teacher 152 | optimized_pipeline = 
teleprompter.compile( 153 | student, 154 | trainset=trainset, 155 | teacher=teacher 156 | ) 157 | 158 | # Evaluate the optimized pipeline 159 | accuracy = self._evaluate_pipeline(config, self.dataset_path, config['num_threads']) 160 | 161 | # Update best pipeline if this one is better 162 | if accuracy > best_accuracy: 163 | best_accuracy = accuracy 164 | print(f"New best accuracy: {accuracy:.1%}") 165 | best_pipeline = optimized_pipeline 166 | 167 | # Set current optimized pipeline as teacher for next iteration 168 | teacher = optimized_pipeline 169 | print("Teacher updated") 170 | 171 | 172 | print(f"Iteration {iteration + 1} accuracy: {accuracy:.1%}") 173 | 174 | accuracy = best_accuracy 175 | 176 | result = { 177 | **config, 178 | 'accuracy': accuracy, 179 | 'timestamp': time.time() 180 | } 181 | self.results_history.append(result) 182 | 183 | # Update best accuracy 184 | if accuracy > self.best_accuracy: 185 | self.best_accuracy = accuracy 186 | self.best_config = config 187 | else: 188 | # Just evaluate baseline pipeline 189 | accuracy = evaluate_pipeline( 190 | dataset_path=self.dataset_path, 191 | num_layers=config['num_layers'], 192 | num_threads=config['num_threads'], 193 | model=config['model'], 194 | temperature=config['temperature'] 195 | ) 196 | 197 | result = { 198 | **config, 199 | 'accuracy': accuracy, 200 | 'timestamp': time.time() 201 | } 202 | self.results_history.append(result) 203 | 204 | # Update best accuracy 205 | if accuracy > self.best_accuracy: 206 | self.best_accuracy = accuracy 207 | self.best_config = config 208 | 209 | elapsed = time.time() - start_time 210 | 211 | # Print optimization results 212 | print("\nOptimization Results:") 213 | print(f"Time taken: {elapsed:.1f}s") 214 | print(f"Bootstrap iterations completed: {len(self.results_history)}") 215 | print(f"\nBest Configuration:") 216 | print(f"Number of layers: {self.best_config['num_layers']}") 217 | print(f"Temperature: {self.best_config['temperature']}") 218 | print(f"Accuracy: {self.best_accuracy:.1%}") 219 | 220 | # Print performance progression if we have results 221 | if self.results_history: 222 | print("\nPerformance History:") 223 | for result in sorted(self.results_history, 224 | key=lambda x: x['accuracy'], 225 | reverse=True)[:5]: 226 | print(f"\nLayers: {result['num_layers']}, " 227 | f"Temp: {result['temperature']:.1f}, " 228 | f"Accuracy: {result['accuracy']:.1%}") 229 | 230 | return self.best_config 231 | 232 | import argparse 233 | 234 | def parse_args(): 235 | """Parse command line arguments""" 236 | parser = argparse.ArgumentParser(description='Optimize residual pipeline') 237 | parser.add_argument('--pipeline-type', type=str, default=PIPELINE_TYPE_STANDARD, 238 | choices=[PIPELINE_TYPE_STANDARD, PIPELINE_TYPE_ITER], 239 | help='Type of pipeline to optimize') 240 | parser.add_argument('--optimizer', type=str, default="bfs", 241 | choices=["bfs", "mipro"], 242 | help='Optimizer to use (bfs=BootstrapFewShot, mipro=MIPROv2)') 243 | parser.add_argument('--dataset', type=str, default="math_dataset.json", 244 | help='Path to dataset file') 245 | parser.add_argument('--threads', type=int, default=10, 246 | help='Number of threads to use') 247 | parser.add_argument('--iterations', type=int, default=3, 248 | help='Number of BFS iterations to run') 249 | return parser.parse_args() 250 | 251 | def main(): 252 | args = parse_args() 253 | 254 | print(f"\nOptimizing {args.pipeline_type} pipeline...") 255 | print(f"Optimizer: {args.optimizer.upper()}") 256 | print(f"Dataset: {args.dataset}") 257 | 
print(f"Threads: {args.threads}\n") 258 | 259 | optimizer = PipelineOptimizer(pipeline_type=args.pipeline_type) 260 | baseline_config = optimizer.optimize( 261 | dataset_path=args.dataset, 262 | num_threads=args.threads, 263 | optimizer_type=args.optimizer, 264 | num_iterations=args.iterations 265 | ) 266 | 267 | 268 | if __name__ == "__main__": 269 | main() 270 | -------------------------------------------------------------------------------- /serper_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import requests 5 | from typing import List, Dict, Optional 6 | 7 | from pprint import pprint 8 | 9 | def main(): 10 | if len(sys.argv) < 2: 11 | print("Usage: serper_search.py