├── .gitignore ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── base_agent ├── .gitignore ├── README.md ├── __main__.py ├── agent.py ├── agent_change_log.md ├── conftest.py ├── description.txt ├── pytest.ini ├── requirements.txt ├── src │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_calling.py │ │ ├── assistant_base_agent.py │ │ ├── base_agent.py │ │ └── implementations │ │ │ ├── __init__.py │ │ │ ├── archive_explorer.py │ │ │ ├── coder.py │ │ │ ├── main_orchestrator.py │ │ │ ├── problem_solver.py │ │ │ ├── reasoner.py │ │ │ └── review_committee_member.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── aime.py │ │ ├── aiq_benchmark.py │ │ ├── aiq_project_benchmarks.py │ │ ├── arc_agi.py │ │ ├── base.py │ │ ├── drop.py │ │ ├── file_editing.py │ │ ├── gpqa.py │ │ ├── gsm8k.py │ │ ├── gsm_ic.py │ │ ├── humaneval.py │ │ ├── livecodebench.py │ │ ├── math.py │ │ ├── refute.py │ │ ├── swebench_verified.py │ │ └── symbol_location.py │ ├── callgraph │ │ ├── __init__.py │ │ ├── digraph.py │ │ ├── manager.py │ │ └── reporting.py │ ├── config.py │ ├── events │ │ ├── __init__.py │ │ ├── event_bus.py │ │ └── event_bus_utils.py │ ├── llm │ │ ├── __init__.py │ │ ├── api.py │ │ ├── base.py │ │ ├── metering.py │ │ └── providers │ │ │ ├── __init__.py │ │ │ ├── anthropic.py │ │ │ ├── base_provider.py │ │ │ ├── deepseek.py │ │ │ ├── fireworks.py │ │ │ ├── google.py │ │ │ ├── google_oai.py │ │ │ ├── google_rest.py │ │ │ ├── openai.py │ │ │ └── vertex.py │ ├── oversight │ │ ├── graph_visualisation.py │ │ └── overseer.py │ ├── schemas │ │ ├── __init__.py │ │ ├── json_parsing.py │ │ ├── representation.py │ │ ├── xml_dumps.py │ │ └── xml_parsing.py │ ├── tools │ │ ├── __init__.py │ │ ├── answer_submission.py │ │ ├── archive_tools.py │ │ ├── base_agent_tools.py │ │ ├── base_tool.py │ │ ├── calculator.py │ │ ├── committee_design.py │ │ ├── directory_tools.py │ │ ├── edit_tools │ │ │ ├── __init__.py │ │ │ ├── overwrite_file.py │ │ │ └── utils.py │ │ ├── execute_command.py │ │ ├── file_tools.py │ │ ├── reasoning_structures │ │ │ ├── __init__.py │ │ │ ├── coding.py │ │ │ ├── meta_improvement.py │ │ │ ├── sequential.py │ │ │ └── sequential_subagents.py │ │ └── ripgrep_tool.py │ ├── types │ │ ├── __init__.py │ │ ├── agent_types.py │ │ ├── common.py │ │ ├── event_types.py │ │ ├── llm_types.py │ │ └── tool_types.py │ ├── utils │ │ ├── __init__.py │ │ ├── archive_analysis.py │ │ ├── documentation.py │ │ ├── file_views.py │ │ ├── metrics.py │ │ ├── parsing.py │ │ └── stop_tokens.py │ └── web_server │ │ ├── __init__.py │ │ ├── server.py │ │ ├── static │ │ ├── components │ │ │ ├── event-stream.js │ │ │ ├── execution-tree.js │ │ │ └── metrics-display.js │ │ ├── core.js │ │ ├── store.js │ │ ├── styles.css │ │ ├── utils │ │ │ ├── event-utils.js │ │ │ └── formatters.js │ │ └── visualizer.js │ │ └── templates │ │ └── index.html └── tests │ ├── agents │ └── test_agent_calling.py │ ├── benchmarks │ ├── disabled_test_livecode_benchmark.py │ ├── test_benchmark_base.py │ ├── test_file_editing.py │ ├── test_gsm8k_benchmark.py │ └── test_refute_benchmark.py │ ├── events │ └── test_event_bus.py │ ├── test_example.py │ ├── tools │ ├── reasoning_structures │ │ └── test_sequential.py │ ├── test_base_tool.py │ ├── test_calculator.py │ └── test_execute_command.py │ └── utils │ ├── test_archive_analysis.py │ └── test_parsing.py ├── benchmark_data └── .gitkeep ├── figures ├── agent_execution.png └── agent_loop.png ├── results └── interactive_output │ └── agent_outputs │ └── .gitkeep ├── runner.py ├── sandbox ├── 
Dockerfile ├── GOOGLE_APPLICATION_CREDENTIALS.json ├── base_requirements.txt └── configs │ ├── gitignore │ └── sandbox_bashrc └── scripts └── install_swebench_harness.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | **/__pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | ### VisualStudioCode 132 | .vscode/* 133 | !.vscode/settings.json 134 | !.vscode/tasks.json 135 | !.vscode/launch.json 136 | !.vscode/extensions.json 137 | *.code-workspace 138 | **/.vscode 139 | 140 | # JetBrains 141 | .idea/ 142 | 143 | # Data & Models 144 | *.h5 145 | *.tar 146 | *.tar.gz 147 | 148 | # Lightning-Hydra-Template 149 | configs/local/default.yaml 150 | /data/ 151 | /logs/ 152 | .env 153 | 154 | # Aim logging 155 | .aim 156 | 157 | # Custom files and directories 158 | third_party 159 | benchmark_data/aiq_bench 160 | benchmark_data/file_editing_bench 161 | benchmark_data/symbol_location_bench 162 | check_boilerplate.sh 163 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: Robeyns 5 | given-names: Maxime 6 | orcid: https://orcid.org/0000-0001-9802-9597 7 | - family-names: Szummer 8 | given-names: Martin 9 | - family-names: Aitchison 10 | given-names: Laurence 11 | title: "Self-Improving Coding Agent" 12 | version: 0.0.1 13 | date-released: 2025-04-12 14 | repository-code: "https://github.com/MaximeRobeyns/self_improving_coding_agent" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2025 Maxime Robeyns 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | 3 | PWD := $(shell pwd) 4 | 5 | int: ## Interactive run; uses default shell entrypoint 6 | @echo 'Once in the container, type:' 7 | @echo 'python -m agent_code.agent -s -p ""' 8 | @echo 'Watch the agent work on localhost:8080' 9 | docker run --rm -ti \ 10 | -p 8080:8080 \ 11 | -v ${PWD}/base_agent:/home/agent/agent_code:ro \ 12 | -v ${PWD}/results/interactive_output:/home/agent/workdir:rw \ 13 | sica_sandbox 14 | 15 | test: ## Run the unit tests for the agent 16 | @pytest base_agent 17 | 18 | image: ## Docker image for x86_64 19 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 20 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 21 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 22 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 23 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 24 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 25 | docker buildx build --build-context base_agent=./base_agent \ 26 | -f sandbox/Dockerfile \ 27 | -t sica_sandbox \ 28 | --build-arg TARGET_ARCH=x86_64 \ 29 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 30 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 31 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 32 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 33 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 34 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 35 | --load sandbox 36 | 37 | image-mac: ## Docker image for Apple silicon 38 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 39 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 40 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 41 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 42 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 43 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 44 | docker buildx build --build-context base_agent=./base_agent \ 45 | -f sandbox/Dockerfile \ 46 | -t sica_sandbox \ 47 | --build-arg TARGET_ARCH=aarch64 \ 48 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 49 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 50 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 51 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 52 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 53 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 54 | --load sandbox 55 | 56 | docs: ## Compile documentation 57 | python base_agent/src/utils/documentation.py base_agent > base_agent/DOCUMENTATION.md 58 | 59 | meta: ## Run the meta-agent directly for testing (see manual request in __main__.py) 60 | rm -rf results/meta 61 | mkdir -p results/meta/test_logs 62 | cp -r base_agent results/meta/agent_iter 63 | # Copy an existing archive so that the meta agent has something to work with 64 | cp -r results/run_1
results/meta/archive 65 | @echo localhost:8080 66 | docker run --rm -ti \ 67 | -p 8080:8080 \ 68 | -v ${PWD}/base_agent:/home/agent/meta:ro \ 69 | -v ${PWD}/results/meta/archive:/home/agent/archive:ro \ 70 | -v ${PWD}/results/meta/agent_iter:/home/agent/workdir:rw \ 71 | -v ${PWD}/results/meta/test_logs:/home/agent/meta_logdir:rw \ 72 | sica_sandbox python -m meta improve \ 73 | --workdir /home/agent/workdir \ 74 | --logdir /home/agent/meta_logdir 75 | 76 | test_meta_int: ## Interactively test the resulting agent from the target above 77 | docker run --rm -ti \ 78 | -p 8080:8080 \ 79 | -p 8000:8000 \ 80 | -v ${PWD}/results/meta/agent_iter:/home/agent/agent_code:ro \ 81 | -v ${PWD}/results/meta/test_output:/home/agent/workdir:rw \ 82 | sica_sandbox 83 | 84 | 85 | help: 86 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | # Self-Improving Coding Agent 3 | 4 | A coding agent experiment that works on its own codebase. 5 | 6 | ![Agent Loop](figures/agent_loop.png) 7 |
8 | 9 | The system operates as an iterative improvement loop: 10 | 1. evaluating the current agent version on some benchmark tasks to capture how well it does 11 | 2. storing the results in an archive 12 | 3. running the agent on its own codebase to work on an improvement 13 | 4. going back to step 1 with the updated agent code 14 | 15 | See [our workshop paper](https://openreview.net/pdf?id=rShJCyLsOr) for more details. 16 | 17 | ## Quickstart 18 | 19 | > IMPORTANT NOTE: always run the agent in the provided Docker container. Since the agent can execute shell commands, this offers some isolation from your host machine, avoiding inadvertent file system manipulation and similar risks. 20 | 21 | First, make sure you've cloned the repo: 22 | ```bash 23 | git clone https://github.com/MaximeRobeyns/self_improving_coding_agent 24 | ``` 25 | 26 | Then, export some environment variables which will be made available in the 27 | Docker container. The project supports inference from a number of providers to 28 | allow for experimentation across many models. You must export at least one of 29 | these in your _local_ shell, which you can do either directly or with `direnv`, 30 | `dotenv`, etc. Omitting any provider key will simply make that provider's 31 | models unavailable to the agent. 32 | 33 | ```bash 34 | export ANTHROPIC_API_KEY= # For Claude models 35 | export OPENAI_API_KEY= # For GPT-4o and reasoning models (o1, o3, etc.) 36 | export GEMINI_API_KEY= # For Gemini models 37 | export VERTEX_PROJECT_ID= # For models hosted on GCP's Vertex 38 | export FIREWORKS_AI_API_KEY= # For DeepSeek / Llama hosted on Fireworks 39 | export DEEPSEEK_API_KEY= # For DeepSeek direct inference (V3, R1) 40 | export MODAL_TOKEN_ID= # To allow the agent to visit webpages and read papers 41 | export MODAL_TOKEN_SECRET= # To allow the agent to visit webpages and read papers 42 | ``` 43 | For Gemini, you should replace the template file in `sandbox/GOOGLE_APPLICATION_CREDENTIALS.json` with your own credentials. 44 | 45 | Once you have at least one LLM provider's API key exported, you can build the Docker image. The build command is wrapped in a Makefile target for convenience: 46 | 47 | ```bash 48 | make image 49 | ``` 50 | 51 | If you are using an Apple silicon machine, use this target instead: 52 | ``` 53 | make image-mac 54 | ``` 55 | 56 | Finally, install the requirements in your local Python environment: 57 | ```bash 58 | # remember to activate a virtual environment or equivalent here 59 | pip install -r base_agent/requirements.txt 60 | pip install swebench 61 | ``` 62 | 63 | ### Testing the Agent 64 | 65 | To check that the setup was successful, you can run the agent interactively with a manually set initial prompt using this target: 66 | ```bash 67 | make int 68 | ``` 69 | This will start the Docker container and attach your shell to it. You can then run 70 | ```bash 71 | python -m agent_code.agent --server true -p "" 72 | ``` 73 | Then open your browser at http://localhost:8080 to follow the agent execution. This will show you an interactive webpage which visualises the events in the event bus and the agent callgraph, allowing you to click on individual events to see them in more detail, read overseer messages, and collapse sub-agent traces. 74 | 75 | ![Agent Execution](figures/agent_execution.png) 76 | 77 | The agent's working directory is mapped to `results/interactive_output` and any files created will be available here on your machine. Agent logs will be in `results/interactive_output/agent_outputs`.
78 | 79 | You can see more options by doing 80 | ```bash 81 | make help 82 | ``` 83 | or see the agent's arguments with 84 | ```bash 85 | python -m base_agent.agent --help 86 | ``` 87 | 88 | To further configure the agent, including the choice of LLMs, edit `base_agent/src/config.py`. 89 | 90 | ## Self-Improvement Loop 91 | 92 | To run the self-improvement loop, first inspect the list of benchmarks in the `base_agent/src/benchmarks/__init__.py` file, and make sure that you have uncommented those you want to include. Then do 93 | ```bash 94 | python runner.py 95 | ``` 96 | To see all the options, do 97 | ```bash 98 | python runner.py --help 99 | ``` 100 | Common options might be 101 | ```bash 102 | python runner.py --id 1 --workers 6 103 | ``` 104 | 105 | This will start the agent loop, placing the results in `results/run_{id}`. 106 | 107 | ## Things to work on 108 | 109 | Here are some potential things to try with the agent framework: 110 | 111 | - [ ] get the agent to curate / build more of its own benchmarks 112 | - [ ] reduce the variance of self-improvement runs (early features often influence subsequent features) 113 | - [ ] use a stronger LLM to build a scaffold for a weaker LLM 114 | - [ ] find or create more realistic 'software engineering' benchmark tasks 115 | 116 | ## Agent Description 117 | 118 | The agent in `base_agent` is a minimal agent that can just about perform the 119 | meta-improvement task. It lacks efficient file-editing tools, devtools such as 120 | tree-sitter or LSP integrations, and advanced reasoning structures that would 121 | help it when performing coding tasks. It has the necessary building blocks 122 | to bootstrap these features and specialise itself to the distribution of 123 | benchmark tasks included. 124 | 125 | Please see `base_agent/README.md` for a more detailed discussion of the base agent framework.
126 | 127 | ``` 128 | ├── base_agent 129 | │   ├── agent_change_log.md 130 | │   ├── agent.py 131 | │   ├── conftest.py 132 | │   ├── description.txt 133 | │   ├── __main__.py 134 | │   ├── pytest.ini 135 | │   ├── README.md 136 | │   ├── requirements.txt 137 | │   ├── src 138 | │   │   ├── agents 139 | │   │   ├── benchmarks 140 | │   │   ├── callgraph 141 | │   │   ├── config.py 142 | │   │   ├── events 143 | │   │   ├── __init__.py 144 | │   │   ├── llm 145 | │   │   ├── oversight 146 | │   │   ├── schemas 147 | │   │   ├── tools 148 | │   │   ├── types 149 | │   │   ├── utils 150 | │   │   └── web_server 151 | │   └── tests 152 | │       ├── agents 153 | │       ├── benchmarks 154 | │       ├── events 155 | │       ├── __pycache__ 156 | │       ├── test_example.py 157 | │       ├── tools 158 | │       └── utils 159 | ├── benchmark_data 160 | ├── results 161 | │   ├── run_{id} 162 | │   └── interactive_output 163 | ├── runner.py 164 | └── sandbox 165 | ``` 166 | 167 | ### Results Organization 168 | 169 | ``` 170 | results/run_{id}/ 171 | ├── metadata.json # Experiment metadata 172 | └── agent_{i}/ # Agent iteration directory 173 | ├── agent_code/ # Agent implementation 174 | ├── benchmarks/ # Benchmark results 175 | │ └── {bench_name}/ 176 | │ ├── results.jsonl # Per-problem results 177 | │ ├── perf.jsonl # Summary metrics 178 | │ └── traces/ # Detailed traces 179 | └── meta_improvement/ # Improvement logs 180 | ``` 181 | 182 | ## Citation 183 | 184 | ``` 185 | @inproceedings{ 186 | robeyns2025sica, 187 | title={{SICA}: A Self-Improving Coding Agent}, 188 | author={Maxime Robeyns and Martin Szummer and Laurence Aitchison}, 189 | booktitle={ICLR 2025 Workshop on Scaling Self-Improving Foundation Models}, 190 | year={2025}, 191 | url={https://openreview.net/forum?id=rShJCyLsOr} 192 | } 193 | ``` 194 | -------------------------------------------------------------------------------- /base_agent/.gitignore: -------------------------------------------------------------------------------- 1 | ENV_VARS 2 | -------------------------------------------------------------------------------- /base_agent/agent_change_log.md: -------------------------------------------------------------------------------- 1 | # Agent Codebase Change Log 2 | 3 | | Iteration | Change Name | Was Successful? (pending/yes/no) | 4 | |-----------|-------------|----------------------------------| 5 | | 0 | Base Agent | yes | 6 | 7 | 8 | ## Iteration 0: Base Agent 9 | 10 | This is a template iteration which you should follow for subsequent iterations. 11 | 12 | ### Feature Description 13 | 14 | This is to be written at iteration i (in this case, i=0). Describe the intention / motivation / hypothesis behind the change made. 15 | 16 | ### Feature Outcome 17 | 18 | This part is supposed to be written at iteration i + 1 (and potentially updated at subsequent iterations), and comments on the empirical effectiveness of the change. 19 | 20 | ## Iteration 1: 21 | -------------------------------------------------------------------------------- /base_agent/conftest.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree.
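# Usage sketch for the options defined below: tests marked `uses_llm` or
# `slow` (markers declared in pytest.ini) are skipped by default, and only
# run when the matching flag is passed, e.g.:
#
#   pytest base_agent --run-llm --run-slow
#
# (this mirrors the Makefile's `test` target, which wraps `pytest base_agent`)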
6 | import pytest 7 | 8 | # Enable asyncio support for pytest 9 | pytest_plugins = ["pytest_asyncio"] 10 | 11 | # Optional: Define custom command-line options for your markers 12 | def pytest_addoption(parser): 13 | parser.addoption( 14 | "--run-llm", 15 | action="store_true", 16 | default=False, 17 | help="Run tests marked with 'uses_llm'", 18 | ) 19 | parser.addoption( 20 | "--run-slow", 21 | action="store_true", 22 | default=False, 23 | help="Run tests marked with 'slow'", 24 | ) 25 | 26 | # Skip tests based on markers unless the corresponding option is provided 27 | def pytest_collection_modifyitems(config, items): 28 | if not config.getoption("--run-llm"): 29 | skip_llm = pytest.mark.skip(reason="need --run-llm option to run") 30 | for item in items: 31 | if "uses_llm" in item.keywords: 32 | item.add_marker(skip_llm) 33 | if not config.getoption("--run-slow"): 34 | skip_slow = pytest.mark.skip(reason="need --run-slow option to run") 35 | for item in items: 36 | if "slow" in item.keywords: 37 | item.add_marker(skip_slow) 38 | -------------------------------------------------------------------------------- /base_agent/description.txt: -------------------------------------------------------------------------------- 1 | This is the base, v0 agent that is used as a starting point. 2 | -------------------------------------------------------------------------------- /base_agent/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = . 3 | # --ff for previously failed first 4 | # -l for print state on failure 5 | # -x for stop on first failure 6 | # -s for show stdout while testing 7 | # -v for verbose (e.g. show test names) 8 | # -n for n threadsafe parallel workers 9 | addopts = -l -x --ff -s -v 10 | testpaths = tests 11 | filterwarnings = ignore::DeprecationWarning 12 | asyncio_default_fixture_loop_scope = function 13 | markers = 14 | uses_llm: marks tests as using llms (run with '--run-llm') 15 | asyncio: marks tests as asynchronous 16 | integration: marks tests as integration tests 17 | slow: marks tests that run slowly 18 | performance: marks tests that benchmark performance (run with '-m performance') 19 | -------------------------------------------------------------------------------- /base_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines 2 | cryptography 3 | datasets 4 | tiktoken 5 | pydantic[email] 6 | pydantic-settings 7 | python-dotenv 8 | anthropic[vertex]==0.42.0 9 | tabulate 10 | openai 11 | json-repair 12 | rich 13 | jinja2 14 | fastapi 15 | uvicorn[standard] 16 | GitPython 17 | diff-match-patch 18 | swebench 19 | duckduckgo-search 20 | scipy 21 | sympy 22 | google-genai 23 | googlesearch-python 24 | pytest 25 | pytest-asyncio 26 | google-cloud-aiplatform 27 | -------------------------------------------------------------------------------- /base_agent/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/__init__.py -------------------------------------------------------------------------------- /base_agent/src/agents/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # 
LICENSE file in the root directory of this source tree. 6 | """ 7 | The agents module defines the agents that can be composed and called to 8 | construct the broader scaffolding system. An agent might be thought of as a 9 | function in a program in the sense that they can be invoked, invoke other 10 | agents themselves, be composed and so forth - indeed, we maintain a 'callgraph' 11 | of the agent calls in the system. 12 | 13 | Individually, an "agent" is just a class that is used to carefully compose an 14 | LLM's context. The way the LLM sees the context is as follows: 15 | 16 | - a system prompt section, in which the "agent's" definition, goals, and 17 | available tools and sub-agents are defined 18 | - the first "user" message, referred to as the core prompt section, which is 19 | defined by the agent itself and which pertains to the way in which the agent 20 | should go about its execution; what sequence of steps it should follow, what 21 | it should focus on, what outcomes it should try to achieve. This is also 22 | where we put visualisations of system state such as file trees and file 23 | viewers. 24 | - the "assistant" message, which contains the agent's response and consists of 25 | alternating sequences of thought and tool or sub-agent calls. The 26 | 'function calling interface' for tools and sub-agents is very similar, 27 | consisting of an XML sequence whose last closing tag is a stop token. After 28 | this has been generated, the LLM will stop, and the contents of the XML will be 29 | parsed to identify the tool or sub-agent name and the arguments provided; these 30 | will be validated, the tool or sub-agent will be run, and the response will 31 | be serialised. These will then be concatenated to the previously generated 32 | assistant message, and the LLM will be called again with this as the 33 | assistant "pre-fill". 34 | 35 | Note, the way this is implemented, and the programming model to maintain, is 36 | that each agent maintains an 'event stream', published to the event bus. This 37 | is a list of events (such as new assistant messages, tool calls and results, 38 | agent calls and results, file events, overseer notifications and so forth) 39 | which describes the execution of the agent. The assistant message is 40 | reconstructed by filtering this event stream and concatenating the values. At a 41 | basic level, just the assistant messages and tool / agent results can be 42 | concatenated, although other event types can be included. For instance, the 43 | file open event may also be included here (with a view of the file content) in 44 | order to save re-generating the core prompt, which would cause a KV cache miss. 45 | By only appending to the LLM agent's context, we can avoid breaking the 46 | cache, at the cost of lengthening it and potentially duplicating content - 47 | eventually it becomes more cost-effective to consolidate all this file state 48 | into the core prompt, shortening the prompt and re-calculating the KV cache. 49 | 50 | Also note that overseer notification events are handled slightly differently. 51 | When reconstructing the event stream, we stop the current assistant message, 52 | add the overseer notification in a new 'user' message, before continuing with 53 | the rest of the events in a new assistant pre-fill message.
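
As a purely illustrative sketch of this function-calling interface (the
concrete tag names and argument schemas are defined by the schemas and tools
modules, so treat the names below as hypothetical rather than the exact wire
format), a tool invocation generated within the assistant message might look
like:

    <TOOL_CALL>
    <TOOL_NAME>calculator</TOOL_NAME>
    <EXPRESSION>3 * (4 + 5)</EXPRESSION>
    </TOOL_CALL>

where the final closing tag doubles as the stop token: generation halts
there, the block is parsed and validated, the tool is run, and its serialised
result is appended to the assistant pre-fill before the LLM is called again.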
54 | """ 55 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Built-in agent agents providing core capabilities.""" 7 | 8 | from ..base_agent import BaseAgent, AgentResult 9 | 10 | 11 | class DemoAgent(BaseAgent): 12 | """Agent for constructing examples in tools""" 13 | 14 | AGENT_NAME = "demo_agent" 15 | AGENT_DESCRIPTION = "a dummy agent for demonstration" 16 | SYSTEM_PROMPT = "" 17 | 18 | async def construct_core_prompt(self) -> str: 19 | return "" 20 | 21 | @classmethod 22 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]: 23 | return [] 24 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/problem_solver.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from typing import List 7 | from pathlib import Path 8 | from pydantic import Field 9 | 10 | 11 | from .reasoner import ReasoningAgent 12 | from ..base_agent import BaseAgent 13 | from ...config import settings 14 | from ...tools.calculator import Calculator 15 | from ...tools.directory_tools import ViewDirectory 16 | from ...tools.execute_command import ExecuteCommand 17 | from ...tools.file_tools import OpenFile, CloseFile 18 | from ...tools.edit_tools import OverwriteFile 19 | from ...tools.ripgrep_tool import RipGrepTool 20 | from ...tools.committee_design import ReviewCommittee 21 | from .coder import CodingAgent 22 | from ...utils.metrics import make_random_agent_metrics 23 | from ...types.agent_types import AgentStatus, AgentResult 24 | from ...events.event_bus_utils import get_problem_statement 25 | 26 | 27 | class ProblemSolvingAgent(BaseAgent): 28 | """ 29 | A multi-purpose problem-solving agent with access to all tools and capabilities. 30 | 31 | This agent can: 32 | 1. Analyze and decompose complex problems 33 | 2. Plan and execute solutions systematically 34 | 3. Use a wide range of tools and agents 35 | 4. Validate and refine solutions 36 | 5. Handle errors and edge cases 37 | 6. Document and explain its process 38 | """ 39 | 40 | AGENT_NAME = "general_problem_solver" 41 | 42 | AGENT_DESCRIPTION = """ 43 | Your default agent for all tasks. Highly versatile, with broad tool access. Best for tasks requiring multiple capabilities or when specific agent choice isn't obvious. 44 | 45 | Note that the agent will not have the context that you have / be able to see the initial problem statement verbatim. It is up to you to accurately relay this to the sub-agent, or decompose it into sub-tasks if it is very long and repeating it verbatim would be slow and costly. 
46 | 47 | Example capabilities 48 | - Problem decomposition and analysis 49 | - General purpose writing tasks 50 | - Basic coding (although not specialised) 51 | - Quick system and file operations 52 | - Mathematical computation 53 | - Running shell commands 54 | 55 | Choose when: 56 | - Specific agent isn't clearly better 57 | - Need flexible approach 58 | 59 | Avoid when: 60 | - Task fits squarely in another agent's specialty 61 | - Requires deep domain expertise""" 62 | 63 | SYSTEM_PROMPT = """You are a very competent problem solver who finds solutions swiftly and effectively. 64 | 65 | You should 66 | 1. Understand the sense of the problem you have been given 67 | 2. Identify the optimal tools and methods you can use to solve your task 68 | 3. Swiftly execute on the problem 69 | 4. Continuously validate and check your work 70 | 71 | Aim for simple, elegant and correct solutions. 72 | """ 73 | 74 | # Available tools - complete access to all tools 75 | # NOTE: ExitAgent and ReturnResult are automatically included 76 | AVAILABLE_TOOLS = { 77 | Calculator, 78 | ViewDirectory, 79 | ExecuteCommand, 80 | OpenFile, 81 | CloseFile, 82 | OverwriteFile, 83 | RipGrepTool, 84 | ReviewCommittee, 85 | } 86 | 87 | # Available agents 88 | # AVAILABLE_AGENTS = set() 89 | AVAILABLE_AGENTS = {ReasoningAgent, CodingAgent} 90 | 91 | HAS_FILEVIEW = True 92 | 93 | MODEL = settings.MODEL 94 | TEMPERATURE = 0.666 95 | 96 | # Agent parameters 97 | problem_statement: str = Field( 98 | ..., 99 | description="The problem or request you want the problem solver agent to solve", 100 | ) 101 | previous_agent_runs: List[str] = Field( 102 | default=[], 103 | description="A list of descriptions of previous work undertaken by other agents, context which this agent would benefit from knowing. This helps to avoid duplicate work.", 104 | ) 105 | requirements: List[str] = Field( 106 | default=[], 107 | description="A list of very specific and low-level criteria which must be met or become valid for the sub-agent to consider its work done.", 108 | ) 109 | 110 | def __init__( 111 | self, 112 | parent: BaseAgent | None = None, 113 | workdir: Path | None = None, 114 | logdir: Path | None = None, 115 | debug_mode: bool = False, 116 | **data, 117 | ): 118 | super().__init__( 119 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data 120 | ) 121 | 122 | async def construct_core_prompt(self) -> str: 123 | """Construct the core prompt for problem solving.""" 124 | 125 | # initial_request = await get_problem_statement() 126 | # if initial_request is None or initial_request == "": 127 | # raise ValueError( 128 | # "The initial request was not provided to the problem solver" 129 | # ) 130 | 131 | prompt = f"""Here is the problem you have been asked to solve: 132 | 133 | 134 | {self.problem_statement} 135 | 136 | """ 137 | 138 | if self.previous_agent_runs: 139 | prompt += "\n\nWork Previously Completed:" 140 | prompt += "\nYou should pay attention to this list to avoid duplicating work. Also note that this list is for work completed by other agents, which aren't 100% reliable, so treat claims with appropriate caution, and verify accordingly." 141 | for work in self.previous_agent_runs: 142 | prompt += f"\n- {work}" 143 | 144 | if self.requirements: 145 | prompt += "\n\nSpecific requirements which must be met before you can consider the work 'done':" 146 | for req in self.requirements: 147 | prompt += f"\n- {req}" 148 | 149 | prompt += "\n\nReturn your answer when complete."
150 | 151 | return prompt 152 | 153 | @classmethod 154 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]: 155 | """Generate example uses of the tool with their expected outputs.""" 156 | examples = [ 157 | # Example 1: Mathematical Problem Solving 158 | ( 159 | cls( 160 | problem_statement="""Solve the following system of equations: 161 | 3x + 2y = 12 162 | x - y = 1""", 163 | requirements=[ 164 | "Show the full answer derivation", 165 | "Verify the solution numerically using Python", 166 | ], 167 | ), 168 | AgentResult( 169 | agent_name=cls.AGENT_NAME, 170 | status=AgentStatus.SUCCESS, 171 | result="""Solution found: x = 4, y = 3 172 | 173 | Process: 174 | 1. Used elimination method 175 | 2. Verified by substitution in a Python script 176 | 3. Checked both equations 177 | 4. All validation criteria met""", 178 | metrics=make_random_agent_metrics( 179 | tools_enabled=True, agents_enabled=True 180 | ), 181 | ), 182 | ), 183 | # Example 2: Code Analysis and Modification 184 | # ( 185 | # cls( 186 | # problem_statement="""Fix the performance issue in process_data() agent: 187 | # 188 | # - Current implementation uses O(n²) time 189 | # - Need to optimize to O(n) complexity 190 | # - Maintain existing API contract""", 191 | # requirements=[ 192 | # "Keep existing agent signature", 193 | # "Maintain thread safety", 194 | # "Add performance tests", 195 | # ], 196 | # ), 197 | # AgentResult( 198 | # agent_name=cls.AGENT_NAME, 199 | # status=AgentStatus.SUCCESS, 200 | # result="""Optimized process_data() agent: 201 | # 202 | # 1. Analyzed existing implementation 203 | # 2. Identified quadratic loop pattern 204 | # 3. Refactored to use hash table 205 | # 4. Added performance tests 206 | # 5. Verified thread safety 207 | # 6. Maintained API compatibility 208 | # 209 | # Performance improved: 210 | # - Before: O(n²) time, O(1) space 211 | # - After: O(n) time, O(n) space 212 | # - Verified with test suite""", 213 | # metrics=make_random_agent_metrics( 214 | # tools_enabled=True, agents_enabled=True 215 | # ), 216 | # ), 217 | # ), 218 | ] 219 | return examples 220 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/review_committee_member.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from pathlib import Path 7 | from pydantic import Field 8 | 9 | 10 | from ..base_agent import BaseAgent 11 | from ...config import settings 12 | from ...tools.directory_tools import ViewDirectory 13 | from ...tools.file_tools import OpenFile, CloseFile 14 | from ...tools.ripgrep_tool import RipGrepTool 15 | from ...utils.metrics import make_random_agent_metrics 16 | from ...types.agent_types import AgentStatus, AgentResult 17 | from ...types.llm_types import Model 18 | from .reasoner import ReasoningAgent 19 | 20 | 21 | class CommitteeMember(BaseAgent): 22 | """ 23 | A simple review committee agent, with read-only access to the project. 24 | """ 25 | 26 | AGENT_NAME = "meta_agent_design_reviewer" 27 | 28 | AGENT_DESCRIPTION = """A meta-agent design review committee member. 
Called from the committee_design tool.""" 29 | 30 | SYSTEM_PROMPT = """You are a member of a Meta-Agent design review committee, tasked with evaluating a coding agent's design proposal about how to improve a coding agent system, before it begins work on the implementation. Your role is to provide a detailed, constructive and reasonable critique that ensures the proposed design avoids commonly identified pathologies in coding agent design, and is robust, practical, and aligned with the goals of the self-improving coding agent. 31 | 32 | Approach the review with a critical yet collaborative mindset, drawing on established engineering principles such as simplicity (delete unnecessary parts), conceptual integrity (a cohesive whole), and testability. 33 | 34 | You must ensure that the design is grounded in making the coding agent system better at writing software, advocating for things like 35 | - improving the mechanics of writing the code files: more efficient file editing strategies and tools 36 | - building reasoning and organisational structures which guide the agent to generate better code 37 | - things which improve the speed with which the agent is able to complete code tasks 38 | - features which improve the quality of the written code: such as improving the generated code's formatting and structure, utilities for robust and efficient testing, or enhancements to the maintainability of the code 39 | 40 | Focus on the following desiderata: 41 | - Clarity: Is the proposal understandable and well-articulated? 42 | - Feasibility: Can it be realistically implemented given constraints? 43 | - Robustness: Does it handle real-world challenges (e.g., edge cases, failures)? 44 | - Quality: Does it reflect good design and testing practices for long-term value? 45 | - Grounding: Is it supported by executable feedback (e.g., tests) to verify its claims? 46 | 47 | Provide a structured evaluation: identify strengths, flag weaknesses, and suggest actionable improvements. Avoid vague or frivolous feedback; every critique should tie back to the project's success. Your specialized role will guide your focus, but always consider the proposal as a whole.""" 48 | 49 | # Available tools 50 | # NOTE: ExitAgent and ReturnResult are automatically included 51 | # We limit ourselves to 'read only' tools.
52 | AVAILABLE_TOOLS = { 53 | ViewDirectory, 54 | OpenFile, 55 | CloseFile, 56 | RipGrepTool, 57 | } 58 | 59 | # Available agents 60 | # AVAILABLE_AGENTS = {ReasoningAgent} 61 | AVAILABLE_AGENTS = set() 62 | 63 | HAS_FILEVIEW = True 64 | 65 | MODEL = settings.MODEL 66 | TEMPERATURE = 0.666 67 | 68 | # Agent parameters 69 | proposal: str = Field( 70 | ..., 71 | description="The full proposal to review", 72 | ) 73 | context: str = Field( 74 | ..., 75 | description="The motivation and context for understanding the plan", 76 | ) 77 | specialisation: str = Field( 78 | ..., description="The specialisation of this committee member" 79 | ) 80 | model: Model = Field(default=Model.SONNET_35) 81 | 82 | def __init__( 83 | self, 84 | parent: BaseAgent | None = None, 85 | workdir: Path | None = None, 86 | logdir: Path | None = None, 87 | debug_mode: bool = False, 88 | **data, 89 | ): 90 | super().__init__( 91 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data 92 | ) 93 | 94 | async def construct_core_prompt(self) -> str: 95 | """Construct the core prompt for the committee member.""" 96 | 97 | prompt = f"""{self.specialisation} 98 | 99 | Here is the agent's self-provided goals and context surrounding the plan 100 | 101 | {self.context} 102 | 103 | 104 | Here is the design proposal you have been asked to review: 105 | 106 | 107 | {self.proposal} 108 | 109 | 110 | You should read the README.md file first to get the full context of this self-improving coding agent project. 111 | You should then view the agent_change_log.md to get an idea of what (if anything) has already been tried by the coding agent as it attempts to improve itself, as measured by the benchmark performance. 112 | You can also quickly view any other code files that you need to get context on the proposal. 113 | 114 | Then, craft your review. Don't spend too long opening other files and doing research. Move swiftly. Note that you MUST provide your full review in the return_result tool since this is how it is communicated back. Anything not put in the return_result tool will not be seen by the agent. 115 | 116 | DO NOT attempt the task yourself, and avoid calling tools unless you absolutely need to. Then, simply provide your review in the return_result tool and complete. 117 | """ 118 | 119 | return prompt 120 | 121 | @classmethod 122 | def generate_examples(cls) -> list[tuple["CommitteeMember", AgentResult]]: 123 | """Generate example uses of the tool with their expected outputs. 124 | 125 | Note that the committee member is deterministically invoked (for now) 126 | so these examples won't be used. 127 | """ 128 | examples = [] 129 | return examples 130 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
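# To include a benchmark in the self-improvement loop, uncomment its entry in
# the `benchmark_registry` mapping below (this is the workflow described in
# the "Self-Improvement Loop" section of the top-level README).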
6 | from typing import Type 7 | from collections import OrderedDict 8 | 9 | from .base import BaseBenchmark 10 | from .gpqa import GPQABenchmark 11 | from .aime import AIMEBenchmark 12 | from .drop import DROPBenchmark 13 | from .math import MATHBenchmark 14 | from .gsm8k import GSM8KBenchmark 15 | from .gsm_ic import GSMICBenchmark 16 | from .refute import RefuteBenchmark 17 | from .arc_agi import ARCAGIBenchmark 18 | from .humaneval import HumanEvalBenchmark 19 | from .file_editing import FileEditingBenchmark 20 | from .aiq_benchmark import AIQBenchmark 21 | from .livecodebench import LiveCodeBenchmark 22 | from .symbol_location import SymbolLocationBenchmark 23 | from .swebench_verified import SWEBenchBenchmark 24 | from .aiq_project_benchmarks import ( 25 | LinalgAIQBenchmark, 26 | CSVParsingAIQBenchmark, 27 | MessagingAppAIQBenchmark, 28 | DistKVStoreAIQBenchmark, 29 | ) 30 | 31 | # Important, append new benchmarks to the end of this 32 | benchmark_registry: OrderedDict[str, Type[BaseBenchmark]] = OrderedDict( 33 | [ 34 | (GSM8KBenchmark.name, GSM8KBenchmark), 35 | # (DROPBenchmark.name, DROPBenchmark), 36 | # (ARCAGIBenchmark.name, ARCAGIBenchmark), 37 | # (MATHBenchmark.name, MATHBenchmark), 38 | # (GSMICBenchmark.name, GSMICBenchmark), 39 | # (FileEditingBenchmark.name, FileEditingBenchmark), 40 | # (SWEBenchBenchmark.name, SWEBenchBenchmark), 41 | # (HumanEvalBenchmark.name, HumanEvalBenchmark), 42 | # (AIMEBenchmark.name, AIMEBenchmark), 43 | # (GPQABenchmark.name, GPQABenchmark), 44 | # (LiveCodeBenchmark.name, LiveCodeBenchmark), 45 | # (SymbolLocationBenchmark.name, SymbolLocationBenchmark), 46 | # (RefuteBenchmark.name, RefuteBenchmark), 47 | # (AIQBenchmark.name, AIQBenchmark), 48 | # (LinalgAIQBenchmark.name, LinalgAIQBenchmark), 49 | # (CSVParsingAIQBenchmark.name, CSVParsingAIQBenchmark), 50 | # (MessagingAppAIQBenchmark.name, MessagingAppAIQBenchmark), 51 | # (DistKVStoreAIQBenchmark.name, DistKVStoreAIQBenchmark), 52 | ] 53 | ) 54 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/aime.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import random 7 | import logging 8 | 9 | from pathlib import Path 10 | from datasets import load_dataset 11 | from dataclasses import dataclass 12 | 13 | from .base import BaseBenchmark, Problem 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | @dataclass 20 | class AIMEExample: 21 | """A single AIME example.""" 22 | problem_id: str # e.g., "2024-I-1" 23 | problem: str 24 | solution: str 25 | answer: int 26 | 27 | @classmethod 28 | def from_raw(cls, example: dict) -> "AIMEExample": 29 | """Create an AIMEExample from a raw dataset example.""" 30 | return cls( 31 | problem_id=str(example["ID"]), 32 | problem=example["Problem"].strip(), 33 | solution=example["Solution"].strip(), 34 | answer=int(example["Answer"]) # AIME answers are always integers 35 | ) 36 | 37 | 38 | class AIMEBenchmark(BaseBenchmark): 39 | """Benchmark for the American Invitational Mathematics Examination (AIME) 2024 dataset. 40 | 41 | The AIME is a prestigious high school mathematics competition known for its challenging 42 | mathematical problems. All answers in AIME are integers. 
43 | """ 44 | 45 | name = "aime" 46 | 47 | def __init__(self, seed: int | None = 1, subset_size: int | None = 20): 48 | super().__init__(seed, subset_size) 49 | 50 | # Load dataset from HuggingFace 51 | dataset = load_dataset("Maxwell-Jia/AIME_2024") 52 | self.test_data = [AIMEExample.from_raw(ex) for ex in dataset["train"]] # Dataset only has train split 53 | 54 | # Create randomized subset if requested 55 | if subset_size is not None: 56 | random.seed(seed) 57 | self.test_data = random.sample(self.test_data, subset_size) 58 | 59 | # Convert to Problem instances 60 | self._data = [ 61 | Problem( 62 | problem_id=ex.problem_id, 63 | statement=ex.problem, 64 | answer=ex.answer, 65 | answer_discussion=ex.solution, 66 | ) 67 | for ex in self.test_data 68 | ] 69 | 70 | @property 71 | def problems(self) -> list[Problem]: 72 | return self._data 73 | 74 | async def score_problem( 75 | self, 76 | problem: Problem, 77 | agent_workdir: str, 78 | agent_answer_dir: str, 79 | container_name: str, 80 | ) -> tuple[float, str | None, str | None]: 81 | """Score the answer to the problem. 82 | 83 | Since AIME answers are always integers, we can do exact matching without 84 | any floating-point comparison. 85 | 86 | Returns: 87 | tuple of: 88 | - score (0.0 or 1.0) 89 | - error message (if any) 90 | - solution discussion 91 | """ 92 | try: 93 | answer_path = Path(agent_answer_dir) / "answer.txt" 94 | llm_answer = answer_path.read_text().strip() 95 | 96 | # Clean the answer by removing any commas and whitespace 97 | llm_answer = llm_answer.replace(",", "").replace(" ", "") 98 | 99 | # Convert to integer and compare exactly 100 | try: 101 | answer_int = int(llm_answer) 102 | if answer_int == problem.answer: 103 | return 1.0, None, problem.answer_discussion 104 | return 0.0, None, problem.answer_discussion 105 | except ValueError: 106 | return 0.0, "Answer must be an integer", problem.answer_discussion 107 | 108 | except Exception as e: 109 | logger.debug(f"Error in AIME scoring: {e}") 110 | return 0.0, str(e), problem.answer_discussion 111 | 112 | 113 | if __name__ == "__main__": 114 | import tempfile 115 | 116 | def run_test_case(benchmark: AIMEBenchmark, answer_dir: Path, 117 | ground_truth: int, agent_answer: str, should_pass: bool): 118 | """Helper function to run a single test case""" 119 | print(f"\nTESTING: '{ground_truth}' vs '{agent_answer}' (should_pass={should_pass})") 120 | 121 | # Use first problem as template but override answer 122 | problem = benchmark.problems[0] 123 | problem.answer = ground_truth 124 | problem.answer_discussion = "Test discussion" 125 | 126 | answer_file = answer_dir / "answer.txt" 127 | answer_file.write_text(agent_answer) 128 | 129 | score, error, _ = benchmark.score_problem( 130 | problem, str(answer_dir.parent), str(answer_dir), "test" 131 | ) 132 | 133 | assert score == (1.0 if should_pass else 0.0), \ 134 | f"Failed: '{ground_truth}' vs '{agent_answer}' got {score}, expected {1.0 if should_pass else 0.0}" 135 | if error: 136 | print(f"Error message: {error}") 137 | 138 | # Create test environment 139 | benchmark = AIMEBenchmark() 140 | 141 | with tempfile.TemporaryDirectory() as tmpdir: 142 | answer_dir = Path(tmpdir) / "answers" 143 | answer_dir.mkdir() 144 | 145 | print("\nTesting basic integer answers...") 146 | test_cases = [ 147 | (42, "42", True), 148 | (42, "42.0", False), # Must be exact integer 149 | (1000, "1,000", True), # Allow commas 150 | (1000, "1000", True), 151 | (1000, " 1000 ", True), # Allow whitespace 152 | (42, "abc", False), # Non-numeric 153 | 
(-123, "-123", True), # Negative numbers 154 | (0, "0", True), 155 | (0, "0.0", False), 156 | (42, "41", False), # Wrong answer 157 | ] 158 | for truth, pred, should_pass in test_cases: 159 | run_test_case(benchmark, answer_dir, truth, pred, should_pass) 160 | 161 | # Test that the dataset loads correctly 162 | print("\nTesting dataset loading...") 163 | assert len(benchmark.problems) > 0, "Dataset should not be empty" 164 | assert all(isinstance(p.answer, int) for p in benchmark.problems), \ 165 | "All answers should be integers" 166 | assert all(isinstance(p.problem_id, str) for p in benchmark.problems), \ 167 | "All problem IDs should be strings" 168 | assert all(p.problem_id.startswith("2024-") for p in benchmark.problems), \ 169 | "All problem IDs should start with 2024-" 170 | 171 | # Test subset functionality 172 | print("\nTesting subset functionality...") 173 | subset_size = 5 174 | benchmark_subset = AIMEBenchmark(seed=42, subset_size=subset_size) 175 | assert len(benchmark_subset.problems) == subset_size, \ 176 | f"Subset size should be {subset_size}, got {len(benchmark_subset.problems)}" 177 | 178 | # Test seed reproducibility 179 | print("\nTesting seed reproducibility...") 180 | benchmark_subset1 = AIMEBenchmark(seed=42, subset_size=subset_size) 181 | benchmark_subset2 = AIMEBenchmark(seed=42, subset_size=subset_size) 182 | assert [p.problem_id for p in benchmark_subset1.problems] == \ 183 | [p.problem_id for p in benchmark_subset2.problems], \ 184 | "Same seed should produce same subset" 185 | 186 | # Test different seeds produce different subsets 187 | benchmark_subset3 = AIMEBenchmark(seed=43, subset_size=subset_size) 188 | assert [p.problem_id for p in benchmark_subset1.problems] != \ 189 | [p.problem_id for p in benchmark_subset3.problems], \ 190 | "Different seeds should produce different subsets" 191 | 192 | print("\nAll tests passed! ✨") 193 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/base.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
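# This module defines the benchmark interface used across the package: a
# concrete benchmark subclasses `BaseBenchmark`, sets the `name` class
# variable, exposes its `Problem` list via the `problems` property, and
# implements the async `score_problem` hook (see e.g. gsm8k.py or aime.py in
# this package for worked examples).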
6 | import jsonlines 7 | 8 | from abc import abstractmethod 9 | from typing import Any, ClassVar 10 | from pathlib import Path 11 | from datetime import datetime 12 | from dataclasses import dataclass, asdict 13 | 14 | 15 | @dataclass 16 | class Problem: 17 | """A single benchmark problem, containing a problem_id, problem statement and answer""" 18 | 19 | problem_id: str 20 | statement: str 21 | answer: Any 22 | answer_discussion: str | None 23 | 24 | 25 | @dataclass 26 | class ProblemResult: 27 | """Complete record of a single problem attempt""" 28 | 29 | problem_id: str 30 | timestamp: str | None = None 31 | score: float | None = None 32 | tokens_used: int | None = None 33 | num_cached_tokens: int | None = None 34 | cost_estimate: float | None = None 35 | wall_time: float | None = None 36 | timed_out: bool = False 37 | cost_threshold_exceeded: bool = False 38 | 39 | def is_complete(self) -> bool: 40 | # Considered complete if it has been scored 41 | return self.score is not None 42 | 43 | def update(self, **kwargs) -> None: 44 | for key, value in kwargs.items(): 45 | if hasattr(self, key): 46 | setattr(self, key, value) 47 | else: 48 | raise ValueError(f"Invalid field {key} in ProblemResult update") 49 | 50 | 51 | class BenchmarkTracker: 52 | def __init__(self, results_path: Path): 53 | self.results_path = results_path 54 | self.results: dict[str, ProblemResult] = self._load_or_create() 55 | 56 | def _load_or_create(self) -> dict[str, ProblemResult]: 57 | results = {} 58 | if self.results_path.exists(): 59 | with jsonlines.open(self.results_path) as reader: 60 | for line in reader: 61 | results[line["problem_id"]] = ProblemResult(**line) 62 | return results 63 | 64 | def start_problem(self, problem_id: str) -> None: 65 | result = ProblemResult( 66 | problem_id=problem_id, timestamp=datetime.now().isoformat() 67 | ) 68 | self.results[problem_id] = result 69 | with jsonlines.open(self.results_path, mode="a") as writer: 70 | writer.write(asdict(result)) 71 | 72 | def update_problem(self, problem_id: str, **kwargs) -> None: 73 | if problem_id not in self.results: 74 | raise KeyError(f"Problem {problem_id} not found") 75 | 76 | self.results[problem_id].update(**kwargs) 77 | 78 | # Rewrite the file with updated results 79 | with jsonlines.open(self.results_path, mode="w") as writer: 80 | writer.write_all(asdict(result) for result in self.results.values()) 81 | 82 | 83 | class BaseBenchmark: 84 | 85 | name: ClassVar[str] 86 | 87 | def __init__(self, seed: int | None = None, subset_size: int | None = None): 88 | self.problem_idx: int = 0 89 | self.seed = seed 90 | self.subset_size = subset_size 91 | 92 | @property 93 | @abstractmethod 94 | def problems(self) -> list[Problem]: 95 | pass 96 | 97 | @abstractmethod 98 | async def score_problem( 99 | self, 100 | problem: Problem, 101 | agent_workdir: str, 102 | agent_answer_dir: str, 103 | container_name: str, 104 | ) -> tuple[float, str | None, str | None]: 105 | """ 106 | Score the answer to the problem; the agent_workdir is an absolute path 107 | to the mapped /home/agent/workdir in the docker container, while the 108 | agent_answer_dir is the absolute path to the mapped logdir in the 109 | docker container, which should contain an answer.txt file. 
110 | 
111 |         To get the submitted answer (if relevant):
112 | 
113 |         answer_path = Path(agent_answer_dir) / "answer.txt"
114 |         llm_answer = answer_path.read_text().strip()
115 | 
116 |         Return the score (as a float), any parsing errors, and any additional
117 |         discussion or information about the answer that can assist the summary.
118 |         """
119 |         pass
120 | 
121 |     def get_problem(self, problem_id: str) -> Problem | None:
122 |         """Retrieve a specific problem by ID.
123 |         Override this method if there is a more efficient way of locating the
124 |         problem by problem_id.
125 |         """
126 |         return next((p for p in self.problems if p.problem_id == problem_id), None)
127 | 
128 |     async def setup_problem(
129 |         self, problem: Problem, problem_data_dir: Path, container_name: str
130 |     ) -> None:
131 |         """Optional hook for performing problem-specific setup.
132 | 
133 |         This is called before each problem is run. The problem_data_dir
134 |         will be mounted in the agent's container at /home/agent/workdir.
135 | 
136 |         Args:
137 |             problem: The problem being run
138 |             problem_data_dir: Path to a temporary directory for problem data.
139 |                 This directory will be mounted in the agent's container.
140 |             container_name: The name of the container that the problem will run in
141 |         """
142 |         pass  # Default no-op implementation
143 | 
-------------------------------------------------------------------------------- /base_agent/src/benchmarks/gsm8k.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import re
7 | import random
8 | import logging
9 | 
10 | from typing import List
11 | from pathlib import Path
12 | from datasets import load_dataset
13 | from dataclasses import dataclass
14 | 
15 | from .base import BaseBenchmark, Problem
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | @dataclass
21 | class GSM8KExample:
22 |     """A single GSM8K example."""
23 | 
24 |     question: str
25 |     answer: str
26 |     steps: list[str]
27 |     final_answer: float
28 | 
29 |     @classmethod
30 |     def from_raw(cls, example: dict) -> "GSM8KExample":
31 |         """Create a GSM8KExample from a raw dataset example."""
32 |         # Split answer into steps and final answer
33 |         answer_parts = example["answer"].split("####")
34 |         steps = [s.strip() for s in answer_parts[0].split("\n") if s.strip()]
35 |         final_answer = float(answer_parts[1].strip().replace(",", ""))
36 | 
37 |         return cls(
38 |             question=example["question"].strip() + "\n\nWhen submitting your answer, please just give a single number with no accompanying text, units or other markings.",
39 |             answer=example["answer"].strip(),
40 |             steps=steps,
41 |             final_answer=final_answer,
42 |         )
43 | 
44 |     def extract_calculations(self) -> List[tuple[str, float, float]]:
45 |         """Extract arithmetic calculations from the solution steps.
46 | 
47 |         Returns:
48 |             List of tuples containing (expression, expected_result, actual_result)
49 |         """
50 |         calculations = []
51 |         pattern = r"<<(.+?)=(.+?)>>"
52 | 
53 |         for step in self.steps:
54 |             matches = re.finditer(pattern, step)
55 |             for match in matches:
56 |                 expr, result = match.groups()
57 |                 try:
58 |                     # Clean the expression and make it Python-safe
59 |                     expr = expr.strip().replace("×", "*").replace("÷", "/")
60 |                     actual = eval(
61 |                         expr
62 |                     )  # Note: eval is acceptable here since the input comes from the dataset's own annotations
63 |                     expected = float(result)
64 |                     calculations.append((expr, expected, actual))
65 |                 except Exception:
66 |                     continue
67 | 
68 |         return calculations
69 | 
70 | 
71 | class GSM8KBenchmark(BaseBenchmark):
72 | 
73 |     name = "gsm8k"
74 | 
75 |     def __init__(self, seed: int | None = None, subset_size: int | None = None):
76 |         super().__init__(seed, subset_size)
77 | 
78 |         # Validate inputs
79 |         if subset_size is not None and subset_size <= 0:
80 |             raise ValueError("subset_size must be positive")
81 | 
82 |         dataset = load_dataset("openai/gsm8k", "main")
83 |         # self.train_data = [GSM8KExample.from_raw(ex) for ex in dataset["train"]]
84 |         self.test_data = [GSM8KExample.from_raw(ex) for ex in dataset["test"]]
85 | 
86 |         self._data = [
87 |             Problem(problem_id=str(i), statement=p.question, answer=p.final_answer, answer_discussion="\n".join(p.steps))
88 |             for i, p in enumerate(self.test_data)
89 |         ]
90 | 
91 |         # Create randomized subset if requested
92 |         if subset_size is not None:
93 |             random.seed(seed)
94 |             self._data = random.sample(self._data, subset_size)
95 | 
96 |     @property
97 |     def problems(self) -> list[Problem]:
98 |         return self._data
99 | 
100 |     async def score_problem(
101 |         self,
102 |         problem: Problem,
103 |         agent_workdir: str,
104 |         agent_answer_dir: str,
105 |         container_name: str,
106 |     ) -> tuple[float, str | None, str | None]:
107 |         try:
108 |             answer_path = Path(agent_answer_dir) / "answer.txt"
109 |             llm_answer = answer_path.read_text().strip()
110 | 
111 |             float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
112 |             if abs(problem.answer - float_answer) < 1e-7:
113 |                 return 1.0, None, problem.answer_discussion
114 |             else:
115 |                 return 0.0, None, problem.answer_discussion
116 |         except Exception as e:
117 |             logger.debug(f"Error in gsm8k scoring: {e}")
118 |             return 0.0, str(e), problem.answer_discussion
-------------------------------------------------------------------------------- /base_agent/src/benchmarks/gsm_ic.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
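Before moving on to GSM-IC, here is a small worked example of the GSM8K parsing in gsm8k.py above: `from_raw` splits a raw record on the `####` marker, and `extract_calculations` pulls the `<<expr=result>>` annotations out of the steps. The record shown is illustrative, not a real dataset row, and assumes `GSM8KExample` is in scope:

raw = {
    "question": "Ali has 4 boxes of 6 eggs. He sells 5 eggs. How many are left?",
    "answer": "He starts with <<4*6=24>>24 eggs.\nAfter selling he has <<24-5=19>>19 left.\n#### 19",
}

ex = GSM8KExample.from_raw(raw)
# The solution text before '####' becomes the step list...
assert ex.steps == [
    "He starts with <<4*6=24>>24 eggs.",
    "After selling he has <<24-5=19>>19 left.",
]
# ...and the text after '####' becomes the numeric target.
assert ex.final_answer == 19.0

# Each <<expr=result>> annotation is re-evaluated and recorded as
# (expression, expected_result, actual_result); eval("4*6") yields an int.
assert ex.extract_calculations() == [("4*6", 24.0, 24), ("24-5", 19.0, 19)]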
6 | import random
7 | import logging
8 | 
9 | from pathlib import Path
10 | from datasets import load_dataset
11 | from dataclasses import dataclass
12 | 
13 | from .base import BaseBenchmark, Problem
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | @dataclass
19 | class GSMICExample:
20 |     """A single GSM-IC example with irrelevant context."""
21 | 
22 |     question: str
23 |     answer: float
24 |     n_steps: int
25 | 
26 |     @classmethod
27 |     def from_raw(cls, example: dict) -> "GSMICExample":
28 |         """Create a GSMICExample from a raw dataset example."""
29 |         return cls(
30 |             question=example["question"].strip(),
31 |             answer=float(str(example["answer"]).strip().replace(",", "")),
32 |             n_steps=int(example["n_steps"]),
33 |         )
34 | 
35 | 
36 | class GSMICBenchmark(BaseBenchmark):
37 |     """Benchmark for the GSM-IC dataset that tests mathematical reasoning with irrelevant context."""
38 | 
39 |     name = "gsm_ic"
40 | 
41 |     def __init__(self, seed: int | None = None, subset_size: int | None = None):
42 |         """Initialize the GSM-IC benchmark.
43 | 
44 |         Args:
45 |             subset_size: Optional number of problems to randomly sample; if None, the full validation split is used
46 |         """
47 |         super().__init__(seed, subset_size)
48 | 
49 |         # Validate inputs
50 |         if subset_size is not None and subset_size <= 0:
51 |             raise ValueError("subset_size must be positive")
52 | 
53 |         # Load the dataset
54 |         dataset = load_dataset("voidful/GSM-IC")
55 |         self.data = [GSMICExample.from_raw(ex) for ex in dataset["validation"]]
56 | 
57 |         # Create problem instances
58 |         self._problems = [
59 |             Problem(problem_id=str(i), statement=p.question, answer=p.answer, answer_discussion=None)
60 |             for i, p in enumerate(self.data)
61 |         ]
62 | 
63 |         # Create randomized subset if requested
64 |         if subset_size is not None:
65 |             random.seed(seed)
66 |             self._problems = random.sample(self._problems, subset_size)
67 | 
68 |     @property
69 |     def problems(self) -> list[Problem]:
70 |         """Return the list of problems."""
71 |         return self._problems
72 | 
73 |     async def score_problem(
74 |         self,
75 |         problem: Problem,
76 |         agent_workdir: str,
77 |         agent_answer_dir: str,
78 |         container_name: str,
79 |     ) -> tuple[float, str | None, str | None]:
80 |         """Score an answer from the LLM against the ground truth.
81 | 
82 |         Args:
83 |             problem: Problem instance containing the ground truth
84 |             agent_answer_dir: Directory expected to contain the agent's answer.txt
85 | 
86 |         Returns:
87 |             A (score, parse_error, discussion) tuple, with a score of 1.0 if the answer is correct and 0.0 otherwise
88 |         """
89 |         try:
90 |             answer_path = Path(agent_answer_dir) / "answer.txt"
91 |             llm_answer = answer_path.read_text().strip()
92 | 
93 |             # Clean and convert llm answer to float
94 |             float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
95 | 
96 |             # Compare with small tolerance
97 |             if abs(problem.answer - float_answer) < 1e-7:
98 |                 return 1.0, None, None
99 |             return 0.0, None, None
100 | 
101 |         except Exception as e:
102 |             logger.debug(f"Error in GSM-IC scoring: {e}")
103 |             return 0.0, str(e), None
-------------------------------------------------------------------------------- /base_agent/src/callgraph/__init__.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Call graph tracking and oversight for agent functions and tools.
8 | 9 | This module provides: 10 | - Graph data structures for tracking execution 11 | - Visualization utilities 12 | """ 13 | -------------------------------------------------------------------------------- /base_agent/src/callgraph/digraph.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Core directed graph implementation for tracking sub-agent calls. 7 | 8 | Note, in this module, the terms "agent" and "function" are used interchangeably. 9 | """ 10 | 11 | from typing import Dict, Set, List, Optional, Iterator 12 | from datetime import datetime 13 | from dataclasses import dataclass, field 14 | 15 | 16 | @dataclass 17 | class FunctionNode: 18 | """ 19 | Represents a function execution in the call graph. 20 | 21 | This tracks the essential metadata about a function execution, 22 | including timing, results, and relationships to other functions. 23 | """ 24 | 25 | # Core identity 26 | id: str 27 | name: str 28 | parent_id: Optional[str] = None 29 | children: Set[str] = field(default_factory=set) 30 | 31 | # Execution state 32 | started_at: Optional[datetime] = None 33 | completed_at: Optional[datetime] = None 34 | success: bool | None = None 35 | error: Optional[str] = None 36 | 37 | # Function-specific data 38 | args: Dict = field(default_factory=dict) 39 | result: Optional[str] = None 40 | 41 | # Metrics 42 | token_count: int = 0 43 | num_cached_tokens: int = 0 44 | cost: float = 0.0 45 | 46 | @property 47 | def duration_seconds(self) -> Optional[float]: 48 | """Calculate execution duration if completed.""" 49 | if self.completed_at and self.started_at: 50 | return (self.completed_at - self.started_at).total_seconds() 51 | return None 52 | 53 | 54 | class CallGraph: 55 | """ 56 | Directed graph tracking function calls / agent calls. 57 | 58 | The graph maintains parent-child relationships between function 59 | calls and tracks execution metrics for each function. 60 | """ 61 | 62 | def __init__(self): 63 | self.nodes: Dict[str, FunctionNode] = {} 64 | self._root_id: Optional[str] = None 65 | 66 | @property 67 | def root(self) -> Optional[FunctionNode]: 68 | """Get the root node if it exists.""" 69 | return self.nodes.get(self._root_id) if self._root_id else None 70 | 71 | def add_node(self, node: FunctionNode) -> None: 72 | """ 73 | Add a node to the graph. 74 | 75 | If this is the first node, it becomes the root. 
76 | """ 77 | self.nodes[node.id] = node 78 | if not self._root_id: 79 | self._root_id = node.id 80 | 81 | def get_node(self, node_id: str) -> Optional[FunctionNode]: 82 | """Get a node by ID.""" 83 | return self.nodes.get(node_id) 84 | 85 | def add_edge(self, from_id: str, to_id: str) -> None: 86 | """Add a directed edge between nodes.""" 87 | if from_id not in self.nodes or to_id not in self.nodes: 88 | raise ValueError("Both nodes must exist in the graph") 89 | 90 | self.nodes[from_id].children.add(to_id) 91 | self.nodes[to_id].parent_id = from_id 92 | 93 | def get_children(self, node_id: str) -> List[FunctionNode]: 94 | """Get all child nodes of a given node.""" 95 | node = self.nodes.get(node_id) 96 | if not node: 97 | return [] 98 | return [self.nodes[child_id] for child_id in node.children] 99 | 100 | def get_ancestors(self, node_id: str) -> List[FunctionNode]: 101 | """Get all ancestors of a node (parent, parent's parent, etc).""" 102 | ancestors = [] 103 | current = self.nodes.get(node_id) 104 | while current and current.parent_id: 105 | parent = self.nodes.get(current.parent_id) 106 | if parent: 107 | ancestors.append(parent) 108 | current = parent 109 | else: 110 | break 111 | return ancestors 112 | 113 | def get_subtree(self, root_id: str) -> Set[str]: 114 | """Get all node IDs in the subtree rooted at root_id.""" 115 | subtree = {root_id} 116 | node = self.nodes.get(root_id) 117 | if node: 118 | for child_id in node.children: 119 | subtree.update(self.get_subtree(child_id)) 120 | return subtree 121 | 122 | def remove_subtree(self, root_id: str) -> None: 123 | """Remove a node and its entire subtree.""" 124 | subtree = self.get_subtree(root_id) 125 | for node_id in subtree: 126 | node = self.nodes.pop(node_id, None) 127 | if node and node.parent_id: 128 | parent = self.nodes.get(node.parent_id) 129 | if parent: 130 | parent.children.remove(node_id) 131 | 132 | def iter_bfs(self) -> Iterator[FunctionNode]: 133 | """Iterate through nodes in breadth-first order.""" 134 | if not self._root_id: 135 | return 136 | 137 | visited = set() 138 | queue = [self._root_id] 139 | 140 | while queue: 141 | node_id = queue.pop(0) 142 | if node_id not in visited: 143 | visited.add(node_id) 144 | node = self.nodes.get(node_id) 145 | if node: 146 | yield node 147 | queue.extend(node.children) 148 | 149 | def iter_dfs(self) -> Iterator[FunctionNode]: 150 | """Iterate through nodes in depth-first order.""" 151 | if not self._root_id: 152 | return 153 | 154 | visited = set() 155 | 156 | def dfs(node_id: str) -> Iterator[FunctionNode]: 157 | if node_id not in visited: 158 | visited.add(node_id) 159 | node = self.nodes.get(node_id) 160 | if node: 161 | yield node 162 | for child_id in node.children: 163 | yield from dfs(child_id) 164 | 165 | yield from dfs(self._root_id) 166 | 167 | def find_cycles(self) -> List[List[str]]: 168 | """Find any cycles in the graph.""" 169 | cycles = [] 170 | visited = set() 171 | path = [] 172 | path_set = set() 173 | 174 | def dfs(node_id: str) -> None: 175 | if node_id in path_set: 176 | cycle_start = path.index(node_id) 177 | cycles.append(path[cycle_start:] + [node_id]) 178 | return 179 | 180 | if node_id in visited: 181 | return 182 | 183 | visited.add(node_id) 184 | path.append(node_id) 185 | path_set.add(node_id) 186 | 187 | node = self.nodes.get(node_id) 188 | if node: 189 | for child_id in node.children: 190 | dfs(child_id) 191 | 192 | path.pop() 193 | path_set.remove(node_id) 194 | 195 | if self._root_id: 196 | dfs(self._root_id) 197 | 198 | return cycles 199 | 
200 | def get_execution_metrics(self) -> Dict: 201 | """Get overall execution metrics.""" 202 | total_tokens = sum(n.token_count for n in self.nodes.values()) 203 | num_cached_tokens = sum(n.num_cached_tokens for n in self.nodes.values()) 204 | total_cost = sum(n.cost for n in self.nodes.values()) 205 | 206 | complete_nodes = [ 207 | n for n in self.nodes.values() if n.started_at and n.completed_at 208 | ] 209 | 210 | total_duration = ( 211 | sum( 212 | n.duration_seconds 213 | for n in complete_nodes 214 | if n.duration_seconds is not None 215 | ) 216 | if complete_nodes 217 | else 0 218 | ) 219 | 220 | successes = sum(1 for n in self.nodes.values() if n.success) 221 | failures = sum(1 for n in self.nodes.values() if not n.success) 222 | 223 | return { 224 | "total_functions": len(self.nodes), 225 | "total_tokens": total_tokens, 226 | "num_cached_tokens": num_cached_tokens, 227 | "total_cost": total_cost, 228 | "total_duration": total_duration, 229 | "successful_calls": successes, 230 | "failed_calls": failures, 231 | } 232 | -------------------------------------------------------------------------------- /base_agent/src/config.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from pydantic import field_validator 7 | from pydantic_settings import BaseSettings 8 | 9 | from .types.llm_types import Model 10 | 11 | 12 | class Settings(BaseSettings): 13 | # Basic Agent Configuration 14 | NAME: str = "self_referential_agent" 15 | LOG_LEVEL: str = "INFO" 16 | 17 | MODEL: Model = Model.SONNET_37 18 | REASONING_MODEL: Model = Model.O3_MINI 19 | OVERSIGHT_MODEL: Model = Model.SONNET_37 20 | 21 | @field_validator("MODEL", "REASONING_MODEL", "OVERSIGHT_MODEL", mode="before") 22 | def parse_model(cls, value): 23 | """Convert a string model name into a Model enum instance.""" 24 | if isinstance(value, str): 25 | return Model.from_name(value) 26 | elif isinstance(value, Model): 27 | return value 28 | raise ValueError(f"Invalid model value: {value!r}") 29 | 30 | model_config = { 31 | "env_prefix": "AGENT_", 32 | "case_sensitive": True, 33 | "extra": "allow", # Allow extra fields from environment 34 | } 35 | 36 | 37 | settings = Settings() 38 | -------------------------------------------------------------------------------- /base_agent/src/events/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .event_bus import EventBus 7 | 8 | __all__ = ["EventBus"] 9 | -------------------------------------------------------------------------------- /base_agent/src/events/event_bus_utils.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Utility functions for working with the event bus. 7 | 8 | Note: these are entirely un-optimised, and many require inefficient iterations 9 | over the event lists to reconstruct 'views' on the event bus. 
For SWE bench
10 | style tasks, with only up to hundreds of messages, this is not important.
11 | """
12 | 
13 | from typing import Optional, Set, List
14 | 
15 | from .event_bus import EventBus
16 | from ..types.tool_types import ToolResult
17 | from ..types.agent_types import AgentResult
18 | from ..types.event_types import EventType, Event, FileOperation, FileEvent
19 | 
20 | 
21 | async def log_to_stdout(event: Event | FileEvent):
22 |     """Print important events to stdout with clear formatting.
23 | 
24 |     The formatting of this output matters, since it is the visual feedback
25 |     that the meta-agent gets back when test-running itself.
26 |     """
27 | 
28 |     # Common formatting constants
29 |     max_content_len = 50  # Reduced to allow for richer metadata
30 |     prefix_width = 10
31 | 
32 |     def truncate(text: str, length: int = max_content_len) -> str:
33 |         """Helper to truncate text and handle newlines"""
34 |         text = text.replace("\n", " ")
35 |         return f"{text[:length]}..." if len(text) > length else text
36 | 
37 |     def format_output(prefix: str, content: str, metadata: str = "") -> None:
38 |         """Helper to format and print consistent output"""
39 |         print(
40 |             f"{prefix:<{prefix_width}s} => {content}{' | ' + metadata if metadata else ''}"
41 |         )
42 | 
43 |     event_content = truncate(str(event.content))
44 | 
45 |     if event.type in (EventType.CORE_PROMPT_UPDATE, EventType.SYSTEM_PROMPT_UPDATE):
46 |         return
47 |     elif event.type == EventType.ASSISTANT_MESSAGE:
48 |         format_output(event.type.value, event_content)
49 |     elif event.type == EventType.TOOL_CALL:
50 |         name = event.metadata.get("name", "unknown tool")
51 |         args = truncate(str(event.metadata.get("args", {})))
52 |         format_output(event.type.value, f"{name}, {args}")
53 |     elif event.type == EventType.TOOL_RESULT:
54 |         result = event.metadata.get("tool_result")
55 |         if not isinstance(result, ToolResult):
56 |             return
57 |         content = f"{result.tool_name}, success: {result.success}, "
58 |         content += f"duration: {result.duration:.1f}, {event_content} "
59 |         format_output(event.type.value, content)
60 |     elif event.type == EventType.AGENT_CALL:
61 |         name = event.metadata.get("name", "unknown agent")
62 |         args = truncate(str(event.metadata.get("args", {})))
63 |         format_output(event.type.value, f"{name}, {args}")
64 |     elif event.type == EventType.AGENT_RESULT:
65 |         result = event.metadata.get("agent_result")
66 |         if not isinstance(result, AgentResult):
67 |             return
68 |         name = result.agent_name
69 |         status = result.status.value
70 |         duration = result.metrics.duration_seconds or 0.0
71 |         cost = result.metrics.cost
72 |         res = truncate(result.result, 20)
73 |         content = f"{name}, status: {status}, duration: {duration:.1f}, cost: ${cost:.4f}, {res}"
74 |         format_output(event.type.value, content)
75 |     else:
76 |         format_output(event.type.value, event_content)
77 | 
78 | 
79 | async def get_problem_statement() -> str:
80 |     """Get the initial problem statement."""
81 |     event_bus = await EventBus.get_instance()
82 |     # There should only be one, but we handle the case when it was updated somehow
83 |     ps_events = event_bus.get_events_by_type(EventType.PROBLEM_STATEMENT)
84 |     return "\n".join(ps.content for ps in ps_events) if len(ps_events) else ""
85 | 
86 | 
87 | async def get_budget_info() -> dict[str, int | float | None]:
88 |     """Get the budget information for the current run."""
89 |     event_bus = await EventBus.get_instance()
90 |     # There should only be one, but we handle the case when it was updated somehow
91 |     ps_events = event_bus.get_events_by_type(EventType.BUDGET_INFO)
92 |     if
ps_events: 93 | return ps_events[-1].metadata 94 | else: 95 | return dict() 96 | 97 | async def get_latest_sys_prompt_event(agent_id: str | None = None) -> Optional[Event]: 98 | """Get the latest system prompt update event.""" 99 | event_bus = await EventBus.get_instance() 100 | events = ( 101 | event_bus.get_events_by_type(EventType.SYSTEM_PROMPT_UPDATE) 102 | if not agent_id 103 | else event_bus.get_events(agent_id) 104 | ) 105 | system_prompts = [e for e in events if e.type == EventType.SYSTEM_PROMPT_UPDATE] 106 | return system_prompts[-1] if system_prompts else None 107 | 108 | 109 | async def get_latest_core_prompt_event(agent_id: str | None = None) -> Optional[Event]: 110 | """Get the latest core prompt update event.""" 111 | event_bus = await EventBus.get_instance() 112 | events = ( 113 | event_bus.get_events_by_type(EventType.CORE_PROMPT_UPDATE) 114 | if not agent_id 115 | else event_bus.get_events(agent_id) 116 | ) 117 | core_prompts = [e for e in events if e.type == EventType.CORE_PROMPT_UPDATE] 118 | return core_prompts[-1] if core_prompts else None 119 | 120 | 121 | async def get_open_file_set(agent_id: str | None = None) -> Set[FileEvent]: 122 | """Get the set of currently open files.""" 123 | event_bus = await EventBus.get_instance() 124 | open_files: dict[str, FileEvent] = {} 125 | events = ( 126 | event_bus.get_events_by_type(EventType.FILE_EVENT) 127 | if not agent_id 128 | else [ 129 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 130 | ] 131 | ) 132 | 133 | for event in events: 134 | if isinstance(event, FileEvent): 135 | if event.operation == FileOperation.CLOSE and event.path in open_files: 136 | open_files.pop(event.path) 137 | elif event.operation == FileOperation.OPEN: 138 | open_files[event.path] = event 139 | return set(open_files.values()) 140 | 141 | 142 | async def is_file_open(file_path: str, agent_id: str | None = None) -> bool: 143 | """Check if a specific file is open.""" 144 | open_files = await get_open_file_set(agent_id) 145 | return any(file_event.path == file_path for file_event in open_files) 146 | 147 | 148 | async def get_latest_file_event( 149 | file_path: str, 150 | agent_id: str | None = None, 151 | exclude_close: bool = False, 152 | ) -> Optional[FileEvent]: 153 | """Get the most recent file event for a given path.""" 154 | event_bus = await EventBus.get_instance() 155 | events = ( 156 | event_bus.get_events_by_type(EventType.FILE_EVENT) 157 | if not agent_id 158 | else [ 159 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 160 | ] 161 | ) 162 | 163 | file_events = [ 164 | e 165 | for e in events 166 | if isinstance(e, FileEvent) 167 | and e.path == file_path 168 | and (e.operation != FileOperation.CLOSE if exclude_close else True) 169 | ] 170 | return file_events[-1] if file_events else None 171 | 172 | 173 | async def get_file_content_size(agent_id: str | None = None) -> int: 174 | """Calculate total size of content from file events.""" 175 | event_bus = await EventBus.get_instance() 176 | total_size = 0 177 | events = ( 178 | event_bus.get_events_by_type(EventType.FILE_EVENT) 179 | if not agent_id 180 | else [ 181 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 182 | ] 183 | ) 184 | 185 | for event in events: 186 | if isinstance(event, FileEvent): 187 | total_size += len(event.content.encode("utf-8")) 188 | return total_size 189 | 190 | 191 | async def get_subagent_events( 192 | agent_id: str, 193 | event_types: Set[EventType] = set(EventType), 194 | # 
event_types: Set[EventType] = { 195 | # EventType.ASSISTANT_MESSAGE, 196 | # EventType.TOOL_RESULT, 197 | # EventType.AGENT_RESULT, 198 | # EventType.FILE_EVENT, 199 | # EventType.EXTERNAL_MESSAGE, 200 | # }, 201 | ) -> List[Event]: 202 | """Get events for prefilling assistant messages.""" 203 | event_bus = await EventBus.get_instance() 204 | all_events = event_bus.get_events_in_chain(agent_id) 205 | return [e for e in all_events if e.type in event_types] 206 | -------------------------------------------------------------------------------- /base_agent/src/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """LLM integration module for the Self-Referential Agent System. 7 | 8 | This module provides a unified interface for interacting with various LLM providers 9 | including Anthropic, OpenAI, and DeepSeek. 10 | """ 11 | 12 | import logging 13 | 14 | from .base import ( 15 | Message, 16 | Completion, 17 | CompletionChunk, 18 | TimingInfo, 19 | TextContent, 20 | ToolResultContent, 21 | ) 22 | from .api import create_completion, create_streaming_completion 23 | from .metering import token_meter, get_total_cost 24 | 25 | # Quieten LLM API call logs to make stdout more useful 26 | logging.getLogger("httpx").setLevel(logging.WARNING) 27 | 28 | __all__ = [ 29 | "Message", 30 | "Completion", 31 | "CompletionChunk", 32 | "TimingInfo", 33 | "create_completion", 34 | "create_streaming_completion", 35 | "token_meter", 36 | "get_total_cost", 37 | "TextContent", 38 | "ToolResultContent", 39 | ] 40 | -------------------------------------------------------------------------------- /base_agent/src/llm/base.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
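The base.py module that follows defines the core `Message` type. As a small usage sketch (assuming `TextContent` is constructed with a `text` field, as its use in `Message.__str__` suggests):

# Assumes Message and TextContent are imported from the llm module above.
msg = Message(
    role="user",
    content=[TextContent(text="What is 2 + 2?")],
)
print(msg)
# Expected output, per __str__ below:
# Message from role=user
# Text ----------
# What is 2 + 2?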
6 | """Base models and shared functionality for LLM interactions.""" 7 | 8 | from typing import Dict, Optional 9 | from datetime import datetime, timedelta 10 | from pydantic import BaseModel, Field 11 | 12 | from ..types.llm_types import TokenUsage, Model, StopReason, TextContent, ReasoningContent, ToolCallContent, ToolResultContent, ContentTypes 13 | 14 | # NOTE: perhaps move the rest of these classes to the llm_types for consistency 15 | 16 | 17 | class Message(BaseModel): 18 | """A message in a conversation with an LLM.""" 19 | 20 | role: str 21 | content: list[ContentTypes] 22 | name: Optional[str] = None 23 | 24 | def __str__(self) -> str: 25 | parts = [f"Message from role={self.role}"] 26 | for c in self.content: 27 | if isinstance(c, TextContent): 28 | parts.append(f"Text {'-'*10}\n{c.text}") 29 | elif isinstance(c, ReasoningContent): 30 | parts.append(f"Reasoning {'-'*10}\n{c.text}") 31 | elif isinstance(c, ToolCallContent): 32 | parts.append(f"{'-'*10}\nTool call {c.tool_name} (id: {c.call_id}) {c.call_type}: {str(c.tool_args)}\n{'-'*10}") 33 | elif isinstance(c, ToolResultContent): 34 | parts.append(f"{'-'*10}\nTool result {c.tool_name} (id: {c.call_id}): {c.content}\n{'-'*10}") 35 | # return "\n".join([p.replace("\n", "").strip() for p in parts]) 36 | return "\n".join(parts) 37 | 38 | 39 | class TimingInfo(BaseModel): 40 | """Timing information for LLM interactions.""" 41 | 42 | start_time: datetime = Field(description="When the request started") 43 | end_time: datetime = Field(description="When the response completed") 44 | total_duration: timedelta = Field(description="Total duration of the request") 45 | first_token_time: Optional[datetime] = Field( 46 | None, description="When the first token was received" 47 | ) 48 | time_to_first_token: Optional[float] = Field( 49 | None, description="Duration until first token received" 50 | ) 51 | tokens_per_second: Optional[float] = Field( 52 | None, description="Average tokens per second for completion" 53 | ) 54 | 55 | def __str__(self) -> str: 56 | # Format datetime fields to a readable format 57 | fmt = "%Y-%m-%d %H:%M:%S" 58 | parts = [ 59 | f"- Start {self.start_time.strftime(fmt)}, End {self.end_time.strftime(fmt)}", 60 | f"- Duration: {self.total_duration}", 61 | ] 62 | if self.time_to_first_token is not None: 63 | parts.append(f"- TTFT: {self.time_to_first_token:.2f} sec") 64 | if self.tokens_per_second is not None: 65 | parts.append(f"- TPS: {self.tokens_per_second:.2f}") 66 | return "\n".join(parts) 67 | 68 | class CacheMetrics(BaseModel): 69 | """Cache-related metrics.""" 70 | 71 | cache_hits: int = Field(default=0, description="Number of cache hits") 72 | cache_misses: int = Field(default=0, description="Number of cache misses") 73 | cache_writes: int = Field(default=0, description="Number of cache writes") 74 | 75 | @classmethod 76 | def from_dict(cls, data: Optional[Dict[str, int]] = None) -> "CacheMetrics": 77 | """Create metrics from dictionary, preserving provider values.""" 78 | if data is None: 79 | data = {"cache_hits": 0, "cache_misses": 0, "cache_writes": 0} 80 | return cls( 81 | cache_hits=data.get("cache_hits", 0), 82 | cache_misses=data.get("cache_misses", 0), 83 | cache_writes=data.get("cache_writes", 0), 84 | ) 85 | 86 | def to_dict(self) -> Dict[str, int]: 87 | """Convert to dictionary.""" 88 | return self.model_dump() # Use model_dump instead of dict 89 | 90 | 91 | # Completion Types ============================================================ 92 | 93 | class Completion(BaseModel): 94 | """A completion 
response from an LLM.""" 95 | 96 | id: str 97 | content: list[ContentTypes] | list[list[ContentTypes]] 98 | model: Model # Model identifier string 99 | usage: TokenUsage 100 | timing: TimingInfo 101 | cache_metrics: Optional[Dict[str, int]] = None 102 | stop_reason: StopReason | list[StopReason] = StopReason.COMPLETE 103 | stop_sequence: Optional[str] | list[StopReason] = None 104 | continuation_count: Optional[int] = None 105 | raw_response: Optional[Dict] = Field(default=None, exclude=True) 106 | 107 | @property 108 | def finished_early(self) -> bool: 109 | """Check if completion stopped before finishing normally.""" 110 | return self.stop_reason != StopReason.COMPLETE 111 | 112 | @property 113 | def hit_token_limit(self) -> bool: 114 | """Check if completion stopped due to token length.""" 115 | return self.stop_reason == StopReason.LENGTH 116 | 117 | @property 118 | def errored(self) -> bool: 119 | """Check if completion encountered an error.""" 120 | return self.stop_reason == StopReason.ERROR 121 | 122 | def get_cache_metric(self, key: str, default: int = 0) -> int: 123 | """Get a cache metric value safely.""" 124 | if self.cache_metrics is None: 125 | return default 126 | return self.cache_metrics.get(key, default) 127 | 128 | def calculate_cost(self) -> float: 129 | """Calculate the cost for this completion.""" 130 | return self.usage.calculate_cost(self.model.token_cost) 131 | 132 | def __str__(self) -> str: 133 | comp_str = f"{'='*80}\n" 134 | if isinstance(self.content[0], list): 135 | for i, completion in enumerate(self.content): 136 | comp_str += f"Candidate {i:03d} {70*'-'}\n" 137 | for block in completion: 138 | comp_str += str(block) + "\n" 139 | else: 140 | for block in self.content: 141 | comp_str += str(block) + "\n" 142 | comp_str += f"\n{'-'*80}\n" 143 | comp_str += f"Model: {self.model.id}\n" 144 | comp_str += f"""Tokens used: 145 | - Input {self.usage.input_tokens} (cached: {self.usage.cached_prompt_tokens}, written to cache: {self.usage.cache_write_prompt_tokens}) 146 | - Completion {self.usage.completion_tokens} 147 | - Total {self.usage.total_tokens} 148 | """ 149 | if self.stop_reason != StopReason.COMPLETE: 150 | comp_str += f"Stop reason: {self.stop_reason}\n" 151 | if self.stop_sequence: 152 | comp_str += f"Stop sequence: {self.stop_sequence}\n" 153 | 154 | if self.continuation_count: 155 | comp_str += f"Continuations: {self.continuation_count}\n" 156 | 157 | if self.timing: 158 | comp_str += f"Timing:\n{self.timing}\n" 159 | 160 | comp_str += f"Cost: ${self.calculate_cost():.6f}\n" 161 | 162 | comp_str += f"{'='*80}\n" 163 | return comp_str 164 | 165 | 166 | class CompletionChunk(BaseModel): 167 | """A streaming chunk of a completion response.""" 168 | 169 | id: str 170 | content: str # TODO: make tool call or assistant message string 171 | model: Model # Model identifier string 172 | is_finished: bool = False 173 | timing: Optional[TimingInfo] = None 174 | usage: Optional[TokenUsage] = None 175 | cache_metrics: Optional[Dict[str, int]] = None 176 | stop_reason: Optional[StopReason] = None 177 | continuation_count: Optional[int] = None 178 | raw_response: Optional[Dict] = Field(default=None, exclude=True) 179 | 180 | @property 181 | def finished_early(self) -> bool: 182 | """Check if completion stopped before finishing normally.""" 183 | return bool(self.stop_reason and self.stop_reason != StopReason.COMPLETE) 184 | 185 | @property 186 | def hit_token_limit(self) -> bool: 187 | """Check if completion stopped due to token length.""" 188 | return 
bool(self.stop_reason and self.stop_reason == StopReason.LENGTH) 189 | 190 | @property 191 | def errored(self) -> bool: 192 | """Check if completion encountered an error.""" 193 | return bool(self.stop_reason and self.stop_reason == StopReason.ERROR) 194 | 195 | def get_cache_metric(self, key: str, default: int = 0) -> int: 196 | """Get a cache metric value safely.""" 197 | if self.cache_metrics is None: 198 | return default 199 | return self.cache_metrics.get(key, default) 200 | 201 | def model_dump(self, **kwargs) -> Dict: 202 | """Override model_dump to exclude raw_response by default.""" 203 | kwargs.setdefault("exclude", {"raw_response"}) 204 | return super().model_dump(**kwargs) 205 | -------------------------------------------------------------------------------- /base_agent/src/llm/metering.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from typing import DefaultDict 7 | from collections import defaultdict 8 | 9 | from ..types.llm_types import TokenUsage, Model 10 | 11 | # A mapping from models to token usage and dollar cost 12 | token_meter: DefaultDict[Model, TokenUsage] = defaultdict(TokenUsage) 13 | budget_info: dict[str, None | int | float] = dict( 14 | start_time=None, # start timestamp 15 | cost_budget=None, # cost budget in USD 16 | time_budget=None, # time budget in seconds 17 | ) 18 | 19 | 20 | def get_total_cost() -> float: 21 | total = 0.0 22 | for model in Model: 23 | total += token_meter[model].calculate_cost(model.token_cost) 24 | return total 25 | 26 | 27 | def get_total_usage() -> TokenUsage: 28 | usage = TokenUsage() 29 | for model in Model: 30 | usage += token_meter[model] 31 | return usage 32 | 33 | 34 | class CallCounter: 35 | def __init__(self): 36 | self.count = 0 37 | 38 | def count_new_call(self): 39 | self.count += 1 40 | 41 | def get_count(self) -> int: 42 | return self.count 43 | 44 | 45 | llm_call_counter = CallCounter() 46 | -------------------------------------------------------------------------------- /base_agent/src/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
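A sketch of how the metering module above is meant to be used. The field names `input_tokens` and `completion_tokens` appear in `Completion.__str__` earlier in base.py; passing them as keyword arguments to `TokenUsage` is an assumption:

# Assumes token_meter, TokenUsage, Model, get_total_usage, get_total_cost and
# llm_call_counter are imported from the metering module and llm types above.

# Record usage for a model after an API call...
token_meter[Model.SONNET_37] += TokenUsage(input_tokens=1200, completion_tokens=350)
token_meter[Model.O3_MINI] += TokenUsage(input_tokens=800, completion_tokens=200)

# ...and read back the aggregate spend at any point.
print(f"total usage: {get_total_usage().total_tokens} tokens")
print(f"total cost:  ${get_total_cost():.4f}")

llm_call_counter.count_new_call()
assert llm_call_counter.get_count() == 1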
6 | """Provider-specific implementations for different LLM services.""" 7 | 8 | from .anthropic import AnthropicProvider 9 | from .openai import OpenAIProvider 10 | from .deepseek import DeepSeekProvider 11 | from .fireworks import FireworksProvider 12 | 13 | from .google import GoogleProvider 14 | from .google_rest import GoogleRESTProvider 15 | from .google_oai import GoogleOAIProvider 16 | from .vertex import VertexProvider 17 | 18 | __all__ = [ 19 | "AnthropicProvider", 20 | "OpenAIProvider", 21 | "DeepSeekProvider", 22 | "FireworksProvider", 23 | "GoogleProvider", 24 | "GoogleRESTProvider", 25 | "GoogleOAIProvider", 26 | "VertexProvider", 27 | ] 28 | -------------------------------------------------------------------------------- /base_agent/src/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .representation import ( 7 | get_schema_representation, 8 | ArgFormat, 9 | dumps, 10 | ) 11 | from .xml_dumps import xml_dumps 12 | from .xml_parsing import xml_str_to_dict 13 | from .json_parsing import json_str_to_dict 14 | 15 | 16 | from typing import Type 17 | from pydantic import BaseModel 18 | 19 | 20 | async def args_str_to_dict( 21 | tool_args: str, guide_obj: Type[BaseModel], arg_format: ArgFormat, root_tag: str 22 | ) -> tuple[dict | None, str | None]: 23 | 24 | # Get schema representation for LLM fixing 25 | if arg_format == ArgFormat.JSON: 26 | return await json_str_to_dict(tool_args, guide_obj) 27 | else: 28 | tool_args = f"<{root_tag}>\n{tool_args}\n" 29 | return await xml_str_to_dict(tool_args, guide_obj, root_tag=root_tag) 30 | -------------------------------------------------------------------------------- /base_agent/src/schemas/representation.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Utilities for consistent representation of Pydantic models. 8 | Provides both JSON and XML based formats optimized for LLM readability. 9 | """ 10 | 11 | import json 12 | import logging 13 | 14 | from enum import Enum 15 | from typing import Type, Dict, Any, Union, get_args, get_origin, Literal, List 16 | from pydantic import BaseModel 17 | 18 | from .xml_dumps import xml_dumps 19 | from ..types.common import ArgFormat 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def get_type_info(field: Any) -> str: 25 | """ 26 | Get human-readable type info for a field. 27 | Handles both Pydantic fields and schema properties. 
28 | """ 29 | field_type = field.annotation 30 | parts = [] 31 | 32 | # Handle Optional types first 33 | is_optional = False 34 | if get_origin(field_type) is Union and type(None) in get_args(field_type): 35 | is_optional = True 36 | parts.append("optional") 37 | field_type = next(t for t in get_args(field_type) if t is not type(None)) 38 | 39 | # Handle Literal types 40 | if get_origin(field_type) is Literal: 41 | literal_values = get_args(field_type) 42 | options = ", ".join(f"'{val}'" for val in literal_values) 43 | parts.append(f"one of [{options}]") 44 | elif isinstance(field_type, type) and issubclass(field_type, Enum): 45 | options = ", ".join(f"'{item.name}'" for item in field_type) 46 | parts.append(f"one of [{options}]") 47 | else: 48 | # Get base type 49 | if get_origin(field_type) is list: 50 | item_type = get_args(field_type)[0] 51 | type_str = f"list of {_get_base_type(item_type)}" 52 | elif get_origin(field_type) is dict: 53 | key_type, val_type = get_args(field_type) 54 | type_str = ( 55 | f"dict of {_get_base_type(key_type)} to {_get_base_type(val_type)}" 56 | ) 57 | else: 58 | type_str = _get_base_type(field_type) 59 | parts.append(type_str) 60 | 61 | # Add constraints 62 | constraints = _get_field_constraints(field) 63 | if constraints: 64 | parts.extend(constraints) 65 | 66 | # Handle special cases first 67 | is_required = not is_optional and field.is_required() 68 | 69 | if ( 70 | hasattr(field.default, "__class__") 71 | and field.default.__class__.__name__ == "PydanticUndefinedType" 72 | ) or field.default is Ellipsis: 73 | # Required field (PydanticUndefined or Ellipsis) 74 | parts.append("required") 75 | elif field.default_factory is not None: 76 | # Show empty container for defaults from factory functions 77 | if field_type == Dict or get_origin(field_type) is dict: 78 | parts.append("default: {}") 79 | elif field_type == List or get_origin(field_type) is list: 80 | parts.append("default: []") 81 | elif field.default is not None: 82 | # Explicit default value 83 | parts.append(f"default: {_format_default(field.default)}") 84 | elif is_optional: 85 | # Optional field without default 86 | parts.append("default: null") 87 | 88 | # Add description 89 | if field.description: 90 | parts.append(field.description) 91 | 92 | return ", ".join(parts) 93 | 94 | 95 | def _get_base_type(field_type: Type) -> str: 96 | """Map Python types to schema types.""" 97 | type_map = { 98 | str: "string", 99 | int: "integer", 100 | float: "float", 101 | bool: "boolean", 102 | Any: "any", 103 | } 104 | # For custom classes, use class name 105 | if isinstance(field_type, type): 106 | if issubclass(field_type, BaseModel): 107 | return field_type.__name__.lower() 108 | elif issubclass(field_type, Enum): 109 | return "enum" 110 | return type_map.get(field_type, str(field_type)) 111 | 112 | 113 | def _get_field_constraints(field: Any) -> list[str]: 114 | """Extract field constraints as readable strings.""" 115 | constraints = [] 116 | metadata = field.metadata if hasattr(field, "metadata") else [] 117 | 118 | constraint_names = { 119 | "Gt": ("gt", "greater than"), 120 | "Ge": ("ge", "min"), 121 | "Lt": ("lt", "less than"), 122 | "Le": ("le", "max"), 123 | "MinLength": ("min_length", "min length"), 124 | "MaxLength": ("max_length", "max length"), 125 | "MinItems": ("min_items", "min items"), 126 | "MaxItems": ("max_items", "max items"), 127 | } 128 | 129 | for item in metadata: 130 | item_type = type(item).__name__ 131 | if item_type in constraint_names: 132 | attr_name, label = 
constraint_names[item_type]
133 |             value = getattr(item, attr_name)
134 |             if value is not None:
135 |                 constraints.append(f"{label}: {value}")
136 | 
137 |     return constraints
138 | 
139 | 
140 | def _format_default(value: Any) -> str:
141 |     """Format default values consistently."""
142 |     # Check for PydanticUndefined (required field with no default)
143 |     if (
144 |         hasattr(value, "__class__")
145 |         and value.__class__.__name__ == "PydanticUndefinedType"
146 |     ):
147 |         return "required"
148 | 
149 |     if isinstance(value, str):
150 |         return f"'{value}'"
151 |     elif isinstance(value, (list, dict)):
152 |         return json.dumps(value)
153 |     elif isinstance(value, Enum):
154 |         return f"'{value.name}'"
155 |     elif callable(value):  # e.g., default_factory
156 |         return "{}"  # Show empty dict/list for factory defaults
157 |     elif value is Ellipsis:
158 |         return "required"  # Explicit handling of Ellipsis
159 |     elif value is None:
160 |         return "null"
161 | 
162 |     return str(value).lower() if isinstance(value, bool) else str(value)
163 | 
164 | 
165 | def get_json_schema_representation(model: Type[BaseModel]) -> str:
166 |     """
167 |     Generate a JSON schema representation focused on LLM readability.
168 |     """
169 |     fields = model.model_fields
170 |     output = []
171 | 
172 |     for field_name, field in fields.items():
173 |         type_info = get_type_info(field)
174 |         output.append(f'"{field_name}": {type_info}')
175 | 
176 |     return "{\n  " + ",\n  ".join(output) + "\n}"
177 | 
178 | 
179 | def get_xml_schema_representation(
180 |     model: Type[BaseModel], root_tag: str | None = None
181 | ) -> str:
182 |     """
183 |     Generate an XML schema representation focused on LLM readability.
184 |     """
185 |     fields = model.model_fields
186 |     # Add angle brackets for root tag if necessary
187 |     output = [f"<{root_tag}>"] if root_tag else []
188 | 
189 |     for field_name, field in fields.items():
190 |         info = get_type_info(field)
191 |         output.append(f"<{field_name}>{info}</{field_name}>")
192 | 
193 |     if root_tag:
194 |         output.append(f"</{root_tag}>")
195 |     return "\n".join(output)
196 | 
197 | 
198 | def get_schema_representation(
199 |     cls: Type[BaseModel], arg_format: ArgFormat, root_tag: str | None = None
200 | ) -> str:
201 |     if arg_format == ArgFormat.JSON:
202 |         return get_json_schema_representation(cls)
203 |     else:
204 |         return get_xml_schema_representation(cls, root_tag=root_tag)
205 | 
206 | 
207 | def dumps(
208 |     instance: dict, format: ArgFormat, indent: int, root_tag: str | None = None
209 | ) -> str:
210 |     if format == ArgFormat.JSON:
211 |         return json.dumps(instance, indent=indent)
212 |     else:
213 |         return xml_dumps(instance, root_tag=root_tag, indent=indent)
-------------------------------------------------------------------------------- /base_agent/src/tools/__init__.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
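For a rough sense of what the schema helpers in representation.py above produce (the exact field annotations come from `get_type_info`, and the printed output shown is an illustrative sketch rather than captured output):

# Assumes get_schema_representation and ArgFormat are imported from the schemas module.
from pydantic import BaseModel, Field

class SearchArgs(BaseModel):
    query: str = Field(..., description="The search string")
    max_results: int = Field(default=10, description="Result cap")

print(get_schema_representation(SearchArgs, ArgFormat.XML, root_tag="TOOL_ARGS"))
# Roughly:
# <TOOL_ARGS>
# <query>string, required, The search string</query>
# <max_results>integer, default: 10, Result cap</max_results>
# </TOOL_ARGS>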
6 | """ 7 | A module of Agent tools 8 | """ 9 | 10 | from .base_tool import BaseTool, tool_registry 11 | from .file_tools import CloseFile, OpenFile 12 | from .edit_tools import OverwriteFile 13 | from .execute_command import ExecuteCommand 14 | from .directory_tools import ViewDirectory 15 | from .ripgrep_tool import RipGrepTool 16 | 17 | # TODO: expand the concept of toolkits and use throughout the agent implementations 18 | toolkits: dict[str, list[BaseTool]] = dict( 19 | coding=[ 20 | ViewDirectory, 21 | ExecuteCommand, 22 | OpenFile, 23 | CloseFile, 24 | OverwriteFile, 25 | RipGrepTool, 26 | ] 27 | ) 28 | -------------------------------------------------------------------------------- /base_agent/src/tools/answer_submission.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import logging 7 | 8 | from pydantic import Field 9 | 10 | from .base_tool import BaseTool 11 | from ..schemas import args_str_to_dict 12 | from ..types.tool_types import ToolResult 13 | from ..types.agent_types import AgentInterface 14 | from ..types.common import ArgFormat 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | 20 | class SubmitAnswer(BaseTool): 21 | """Tool for submitting answers to benchmark questions on disk. 22 | 23 | This is slightly different to the ReturnResult tool which is used to return 24 | a result from the end of agent function call. 25 | """ 26 | 27 | TOOL_NAME = "submit_answer" 28 | TOOL_DESCRIPTION = """Submit an answer to a benchmark question. The answer should be clear and concise. 29 | The tool will attempt to parse your answer according to the benchmark's requirements. 30 | Your answer should be a complete response that directly addresses the question. 31 | It is very important that you do not include any extraneous words or content in the answer field that may make the parsing fail. 
32 | """ 33 | 34 | # reasoning: str = Field( 35 | # ..., 36 | # description="Reason about the answer you are going to submit and the correct format in which to do so", 37 | # ) 38 | 39 | answer: str = Field( 40 | ..., description="Your complete answer to the benchmark question", min_length=1 41 | ) 42 | 43 | def __init__(self, calling_agent: AgentInterface, **data): 44 | super().__init__(calling_agent=calling_agent, **data) 45 | 46 | async def run(self) -> ToolResult: 47 | """Execute the answer submission with parsing.""" 48 | try: 49 | if not self._calling_agent._logdir: 50 | return ToolResult( 51 | tool_name=self.TOOL_NAME, 52 | success=False, 53 | errors="System error: no answer path available", 54 | ) 55 | 56 | # Validate answer is not empty or just whitespace 57 | answer = self.answer.strip() 58 | if not answer: 59 | return ToolResult( 60 | tool_name=self.TOOL_NAME, 61 | success=False, 62 | errors="Answer cannot be empty", 63 | ) 64 | 65 | # Save answer to disk 66 | path = self._calling_agent._logdir / "answer.txt" 67 | with open(path, "w") as f: 68 | f.write(answer) 69 | 70 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 71 | 72 | except Exception as e: 73 | return ToolResult( 74 | tool_name=self.TOOL_NAME, 75 | success=False, 76 | errors=f"Failed to save answer: {str(e)}", 77 | ) 78 | 79 | @classmethod 80 | async def args_str_to_dict( 81 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML 82 | ) -> tuple[dict | None, str | None]: 83 | args_dict, parse_warnings = await args_str_to_dict( 84 | args_str, guide_obj=cls, arg_format=arg_format, root_tag="TOOL_ARGS" 85 | ) 86 | if args_dict: 87 | args_dict["answer"] = str(args_dict["answer"]) 88 | return args_dict, parse_warnings 89 | 90 | @classmethod 91 | def generate_examples(cls) -> list[tuple["SubmitAnswer", ToolResult]]: 92 | """Generate example uses of the submit_answer tool.""" 93 | from ..agents.implementations import DemoAgent 94 | 95 | return [ 96 | ( 97 | cls( 98 | calling_agent=DemoAgent(), 99 | answer="5", 100 | ), 101 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 102 | ), 103 | ( 104 | cls( 105 | calling_agent=DemoAgent(), 106 | # reasoning="The speed of the car is 10mph", 107 | answer="10 miles per hour", 108 | ), 109 | ToolResult( 110 | tool_name=cls.TOOL_NAME, success=False, errors="Parser error" 111 | ), 112 | ), 113 | ( 114 | cls( 115 | calling_agent=DemoAgent(), 116 | # reasoning="The calculated value is 1,234.5", 117 | answer="1,234.5", 118 | ), 119 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 120 | ), 121 | ] 122 | -------------------------------------------------------------------------------- /base_agent/src/tools/calculator.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import logging 7 | 8 | from pydantic import Field 9 | 10 | from .base_tool import BaseTool 11 | from ..types.tool_types import ToolResult 12 | from ..types.agent_types import AgentInterface 13 | 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | class Calculator(BaseTool): 19 | TOOL_NAME = "calculate" 20 | TOOL_DESCRIPTION = """A calculator tool that evaluates mathematical expressions. 21 | Supports basic arithmetic operations (including +, -, *, / and ^) and parentheses. 
22 | All expressions must contain only numbers and valid operators."""
23 | 
24 |     reasoning: str = Field(
25 |         ..., description="Concise reasoning about the operation to be performed"
26 |     )
27 |     expression: str = Field(
28 |         ...,
29 |         description="Mathematical expression to evaluate",
30 |         pattern=r"^[\d\s\+\-\*\/\(\)\.]+$",
31 |     )
32 | 
33 |     def __init__(self, calling_agent: AgentInterface, **data):
34 |         super().__init__(calling_agent=calling_agent, **data)
35 | 
36 |     async def run(self) -> ToolResult:
37 |         try:
38 |             result = eval(self.expression)
39 |             return ToolResult(
40 |                 tool_name=self.TOOL_NAME, success=True, output=str(result)
41 |             )
42 |         except Exception as e:
43 |             return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
44 | 
45 |     @classmethod
46 |     def generate_examples(cls) -> list[tuple["Calculator", ToolResult]]:
47 |         from ..agents.implementations import DemoAgent
48 | 
49 |         return [
50 |             (
51 |                 cls(
52 |                     calling_agent=DemoAgent(),
53 |                     reasoning="The number of fruit is the sum of the two apples and three oranges",
54 |                     expression="2 + 3",
55 |                 ),
56 |                 ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(5)),
57 |             ),
58 |             (
59 |                 cls(
60 |                     calling_agent=DemoAgent(),
61 |                     reasoning="The compound expression will require parentheses",
62 |                     expression="(3 * 4) / 2",
63 |                 ),
64 |                 ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(6.0)),
65 |             ),
66 |         ]
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     import asyncio
71 |     from ..agents.implementations import DemoAgent
72 | 
73 |     async def test():
74 |         c = Calculator(calling_agent=DemoAgent(), reasoning="...", expression="2+2")
75 |         result = await c.run()
76 | 
77 |         assert result.tool_name == Calculator.TOOL_NAME
78 |         assert result.success
79 |         assert result.duration < 0.5
80 |         assert result.output == str(4)
81 |         print("All tests pass!")
82 | 
83 |     asyncio.run(test())
-------------------------------------------------------------------------------- /base_agent/src/tools/directory_tools.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 | 
8 | from pathlib import Path
9 | from pydantic import Field
10 | 
11 | from .base_tool import BaseTool
12 | from ..utils.file_views import create_filetree, FileTreeOptions
13 | from ..types.tool_types import ToolResult
14 | from ..types.agent_types import AgentInterface
15 | 
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 | 
19 | 
20 | class ViewDirectory(BaseTool):
21 |     """Tool to generate a detailed view of directory contents."""
22 | 
23 |     TOOL_NAME = "view_directory"
24 |     TOOL_DESCRIPTION = """View the contents of a directory with configurable depth and detail options.
25 | 26 | The tool provides a formatted tree view of the directory structure, including: 27 | - File and directory sizes 28 | - Permissions 29 | - Modification times 30 | - Smart collapsing of large directories 31 | - Configurable depth and detail level""" 32 | 33 | directory: str = Field( 34 | ..., 35 | description="The directory path to view", 36 | ) 37 | max_depth: int = Field( 38 | default=2, 39 | description="Maximum depth to traverse (None for unlimited)", 40 | ) 41 | show_hidden: bool = Field( 42 | default=False, 43 | description="Whether to show hidden files and directories", 44 | ) 45 | collapse_threshold: int = Field( 46 | default=15, 47 | description="Number of items before a directory is collapsed (None for no collapsing)", 48 | ) 49 | show_timestamps: bool = Field( 50 | default=False, 51 | description="Whether to show file modification timestamps", 52 | ) 53 | exclude_patterns: list[str] = Field( 54 | default=[], 55 | description="List of glob patterns to exclude (e.g. '.git' or '*.pyc')", 56 | ) 57 | show_full_filepaths: bool = Field( 58 | default=False, 59 | description="Whether to show the full filepaths from the root directory", 60 | ) 61 | 62 | def __init__(self, calling_agent: AgentInterface, **data): 63 | super().__init__(calling_agent=calling_agent, **data) 64 | 65 | async def run(self) -> ToolResult: 66 | try: 67 | path = Path(self.directory) 68 | if not path.exists(): 69 | return ToolResult( 70 | tool_name=self.TOOL_NAME, 71 | success=False, 72 | errors=f"Directory does not exist: {path}", 73 | ) 74 | if not path.is_dir(): 75 | return ToolResult( 76 | tool_name=self.TOOL_NAME, 77 | success=False, 78 | errors=f"Path is not a directory: {path}", 79 | ) 80 | 81 | # Create options for the tree generation 82 | options = FileTreeOptions( 83 | collapse_threshold=self.collapse_threshold, 84 | show_hidden=self.show_hidden, 85 | exclude_patterns=( 86 | self.exclude_patterns 87 | if len(self.exclude_patterns) > 0 or self.show_hidden 88 | else None 89 | ), 90 | show_mtime=self.show_timestamps, 91 | min_dir_level=( 92 | 0 if self.max_depth is None else max(0, self.max_depth - 1) 93 | ), 94 | show_full_path=self.show_full_filepaths, 95 | ) 96 | 97 | # Generate the tree 98 | tree_output = create_filetree(path, options) 99 | 100 | return ToolResult( 101 | tool_name=self.TOOL_NAME, 102 | success=True, 103 | output=f"Directory contents of {path}:\n{tree_output}", 104 | ) 105 | 106 | except Exception as e: 107 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 108 | 109 | @classmethod 110 | def generate_examples(cls) -> list[tuple["ViewDirectory", ToolResult]]: 111 | from ..agents.implementations import DemoAgent 112 | 113 | return [ 114 | # Basic directory view 115 | ( 116 | cls( 117 | calling_agent=DemoAgent(), 118 | directory="/home/agent/workdir", 119 | max_depth=2, 120 | show_hidden=False, 121 | show_timestamps=False, 122 | exclude_patterns=[], 123 | collapse_threshold=20, 124 | ), 125 | ToolResult( 126 | tool_name=cls.TOOL_NAME, 127 | success=True, 128 | output="Directory contents of /home/agent/workdir:\n" 129 | "workdir/ [0755] (1.2MB, 25 files, 5 dirs)\n" 130 | " src/ [0755] (800KB, 15 files, 3 dirs)\n" 131 | " main.py [0644] 50KB\n" 132 | " utils.py [0644] 30KB\n" 133 | " tests/ [0755] (400KB, 10 files, 2 dirs) [collapsed]\n", 134 | ), 135 | ), 136 | # Detailed view with timestamps 137 | ( 138 | cls( 139 | calling_agent=DemoAgent(), 140 | directory="/home/agent/project", 141 | max_depth=1, 142 | show_hidden=True, 143 | show_timestamps=True, 144 | 
exclude_patterns=[".git", "*.pyc"], 145 | collapse_threshold=15, 146 | ), 147 | ToolResult( 148 | tool_name=cls.TOOL_NAME, 149 | success=True, 150 | output="Directory contents of /home/agent/project:\n" 151 | "project/ [0755] (2.5MB, 40 files, 8 dirs) 2024-01-14 10:00\n" 152 | " .env [0644] 2KB 2024-01-14 09:55\n" 153 | " README.md [0644] 15KB 2024-01-14 09:50\n" 154 | " src/ [0755] (1.5MB, 25 files, 5 dirs) 2024-01-14 10:00\n" 155 | " tests/ [0755] (1MB, 15 files, 3 dirs) 2024-01-14 09:45\n", 156 | ), 157 | ), 158 | ] 159 | -------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .overwrite_file import OverwriteFile 7 | 8 | __all__ = ["OverwriteFile"] 9 | -------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/overwrite_file.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import re 7 | import logging 8 | 9 | from pathlib import Path 10 | from pydantic import Field 11 | 12 | from .utils import edit_preflight_check, generate_edit_event_content 13 | from ...schemas.json_parsing import json_str_to_dict 14 | from ..base_tool import BaseTool, extract_between_patterns 15 | from ...events import EventBus 16 | from ...types.tool_types import ToolResult 17 | from ...types.event_types import EventType, FileOperation, FileEvent 18 | from ...types.agent_types import AgentInterface 19 | from ...types.common import ArgFormat 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | class OverwriteFile(BaseTool): 26 | """Tool to overwrite an existing file or create a new one with content.""" 27 | 28 | TOOL_NAME = "overwrite_file" 29 | TOOL_DESCRIPTION = f"""Use this tool when you want to write content verbatim to a file, either overwriting an existing file or creating a new one. 30 | 31 | For existing files: 32 | - You MUST have called the `open_file` tool to view the file before over-writing it 33 | - This is to make sure we're not over-writing anything of value that needs to be kept 34 | - The entire content will be replaced verbatim with the new content provided 35 | 36 | For new files: 37 | - 'Overwriting' a not-yet-existing file will create it 38 | - The file will be automatically opened in the context window after creation 39 | 40 | Very important notes: 41 | - The content you provide to this tool will be that file's new content. You must make sure to include absolutely everything you still need 42 | - Do NOT "fold" any code sections because this will cause errors. Instead, write out everything verbatim. 43 | 44 | - DO NOT, under any circumstances, call this tool for a file edit that exceeds about 500 lines. It will be slow, inefficient, costly and error-prone. For these types of large-file edits, you should seek to use more efficient editing tools. 45 | - You do not need to write out the file ahed of time before invoking this tool. 
46 | """ 47 | 48 | filepath: str = Field( 49 | ..., 50 | description="The full absolute filepath of the file to write. For existing files, must be already open in context window.", 51 | ) 52 | full_unabridged_new_content: str = Field( 53 | ..., 54 | description="The full content to write to the file, which will entirely replace any existing content.", 55 | ) 56 | 57 | def __init__(self, calling_agent: AgentInterface, **data): 58 | super().__init__(calling_agent=calling_agent, **data) 59 | 60 | @classmethod 61 | async def args_str_to_dict( 62 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML 63 | ) -> tuple[dict | None, str | None]: 64 | if arg_format == ArgFormat.XML: 65 | # Carefully extract the content, with the assumption that there _will_ 66 | # be conflicting tags. 67 | # First, manually get the content between 68 | filepath_pattern = f"(.*?)" 69 | filepath_match = re.search(filepath_pattern, args_str) 70 | filepath = filepath_match.group(1) if filepath_match else None 71 | if not filepath: 72 | return None, "Could not parse filepath" 73 | 74 | # Find the first opening tag 75 | content = extract_between_patterns( 76 | args_str, "", "" 77 | ) 78 | if not content: 79 | return None, "Could not parse file content" 80 | 81 | return dict(filepath=filepath, full_unabridged_new_content=content), None 82 | else: 83 | return await json_str_to_dict(args_str, guide_obj=cls) 84 | 85 | async def run(self) -> ToolResult: 86 | try: 87 | path = Path(self.filepath) 88 | event_bus = await EventBus.get_instance() 89 | 90 | # Check if file exists 91 | file_exists = path.exists() 92 | 93 | if not file_exists: 94 | # Create directory structure if needed 95 | path.parent.mkdir(parents=True, exist_ok=True) 96 | 97 | # For new files, write content first 98 | try: 99 | path.write_text(self.full_unabridged_new_content) 100 | 101 | event = FileEvent( 102 | type=EventType.FILE_EVENT, 103 | content=self.full_unabridged_new_content, 104 | path=str(path), 105 | operation=FileOperation.OPEN, 106 | ) 107 | 108 | await event_bus.publish(event, self._calling_agent._id) 109 | 110 | return ToolResult( 111 | tool_name=self.TOOL_NAME, 112 | success=True, 113 | output=f"Successfully created new file {path}", 114 | ) 115 | except Exception as e: 116 | return ToolResult( 117 | tool_name=self.TOOL_NAME, 118 | success=False, 119 | errors=f"Failed to create new file {path}: {str(e)}", 120 | ) 121 | else: 122 | # For existing files, verify it's open first 123 | result = await edit_preflight_check( 124 | path, self.TOOL_NAME, self._calling_agent 125 | ) 126 | if result: 127 | return result 128 | 129 | prev_content = path.read_text() 130 | 131 | # Now write new content 132 | try: 133 | path.write_text(self.full_unabridged_new_content) 134 | 135 | diff_content, content_hash = generate_edit_event_content( 136 | prev_content, self.full_unabridged_new_content, str(path) 137 | ) 138 | 139 | event = FileEvent( 140 | type=EventType.FILE_EVENT, 141 | content=diff_content, 142 | path=str(path), 143 | operation=FileOperation.EDIT, 144 | content_hash=content_hash, 145 | mtime=path.stat().st_mtime, 146 | ) 147 | 148 | await event_bus.publish(event, self._calling_agent._id) 149 | 150 | return ToolResult( 151 | tool_name=self.TOOL_NAME, 152 | success=True, 153 | output=f"Successfully overwrote content of {path}", 154 | ) 155 | except Exception as e: 156 | return ToolResult( 157 | tool_name=self.TOOL_NAME, 158 | success=False, 159 | errors=f"Failed to write to file {path}: {str(e)}", 160 | ) 161 | 162 | except Exception as e: 163 | return 
ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 164 | 165 | @classmethod 166 | def generate_examples(cls) -> list[tuple["OverwriteFile", ToolResult]]: 167 | from ...agents.implementations import DemoAgent 168 | 169 | return [ 170 | # Example 1: Create new file 171 | ( 172 | cls( 173 | calling_agent=DemoAgent(), 174 | filepath="/home/agent/workdir/new_file.txt", 175 | full_unabridged_new_content="Content for the new file", 176 | ), 177 | ToolResult( 178 | tool_name=cls.TOOL_NAME, 179 | success=True, 180 | output="Successfully created new file /home/agent/workdir/new_file.txt", 181 | ), 182 | ), 183 | # Example 2: Overwrite existing file 184 | ( 185 | cls( 186 | calling_agent=DemoAgent(), 187 | filepath="/home/agent/workdir/example.txt", 188 | full_unabridged_new_content="New content for existing file", 189 | ), 190 | ToolResult( 191 | tool_name=cls.TOOL_NAME, 192 | success=True, 193 | output="Successfully overwrote content of /home/agent/workdir/example.txt", 194 | ), 195 | ), 196 | ] 197 |
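The first/last extraction strategy in `args_str_to_dict` above is what lets the new file content itself contain the literal argument tags without corrupting the parse: the parser takes the first opening tag and the last closing tag rather than attempting balanced XML parsing. A standalone sketch of the idea (the helper below is a simplified stand-in for the repository's `extract_between_patterns`):

def extract_between(s: str, open_tag: str, close_tag: str) -> str | None:
    # First occurrence of the opening tag, last occurrence of the closing tag.
    start = s.find(open_tag)
    end = s.rfind(close_tag)
    if start == -1 or end == -1 or end <= start + len(open_tag):
        return None
    return s[start + len(open_tag):end]

args = (
    "<filepath>/tmp/notes.md</filepath>"
    "<full_unabridged_new_content>"
    "This document quotes </full_unabridged_new_content> as a literal string."
    "</full_unabridged_new_content>"
)
content = extract_between(
    args, "<full_unabridged_new_content>", "</full_unabridged_new_content>"
)
# The embedded literal closing tag survives intact in the extracted content.
assert content is not None and "literal string" in content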
-------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/utils.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import hashlib 7 | import difflib 8 | 9 | from pathlib import Path 10 | from datetime import datetime, timedelta 11 | 12 | from ...types.tool_types import ToolResult 13 | from ...types.agent_types import AgentInterface, InheritanceFlags 14 | from ...events.event_bus_utils import is_file_open, get_latest_file_event 15 | 16 | 17 | async def edit_preflight_check( 18 | path: Path, tool_name: str, calling_agent: AgentInterface 19 | ) -> ToolResult | None: 20 | inherits_parent_files = ( 21 | InheritanceFlags.OPEN_FILES in calling_agent.INHERITANCE.flags 22 | ) 23 | 24 | file_open: bool = await is_file_open(str(path), calling_agent._id) 25 | if inherits_parent_files and not file_open: 26 | file_open = await is_file_open(str(path), calling_agent._parent_id) 27 | 28 | # Verify file is open 29 | if not file_open: 30 | return ToolResult( 31 | tool_name=tool_name, 32 | success=False, 33 | errors=f"File {path} must be opened first using the open_files tool", 34 | ) 35 | 36 | eps = timedelta(seconds=0.5) 37 | latest_file_event = await get_latest_file_event( 38 | str(path), calling_agent._id, exclude_close=True 39 | ) 40 | # Assumes agent runs are blocking (i.e. all agent file events will be newer 41 | # than parent file events) 42 | if inherits_parent_files and not latest_file_event: 43 | latest_file_event = await get_latest_file_event( 44 | str(path), 45 | calling_agent._parent_id, 46 | exclude_close=True, 47 | ) 48 | 49 | last_mod = datetime.fromtimestamp(path.stat().st_mtime) 50 | if not latest_file_event or last_mod > latest_file_event.timestamp + eps: 51 | last_viewed = ( 52 | latest_file_event.timestamp.strftime("%Y-%m-%d %H:%M:%S") 53 | if latest_file_event 54 | else "Never" 55 | ) 56 | return ToolResult( 57 | tool_name=tool_name, 58 | success=False, 59 | errors=( 60 | f"File {path} was changed at {last_mod.strftime('%Y-%m-%d %H:%M:%S')}, " 61 | f"which is after you last viewed or edited it at {last_viewed}. " 62 | "Please view it again to get its latest contents before making your edit." 63 | ), 64 | ) 65 | 66 | 67 | def generate_edit_event_content( 68 | old_content: str, new_content: str, path: str 69 | ) -> tuple[str, str]: 70 | """Generate a diff between old and new content for file events. 71 | 72 | Returns: 73 | tuple[str, str]: A tuple containing (content_for_event, content_hash) 74 | where content_for_event contains the diff and content_hash is the hash of new_content 75 | """ 76 | if not old_content and new_content: 77 | # For new files, return the full content 78 | content_hash = hashlib.sha256(new_content.encode()).hexdigest() 79 | return new_content, content_hash 80 | 81 | # Generate unified diff 82 | old_lines = old_content.splitlines() 83 | new_lines = new_content.splitlines() 84 | 85 | diff = list( 86 | difflib.unified_diff( 87 | old_lines, 88 | new_lines, 89 | fromfile=f"a/{path}", 90 | tofile=f"b/{path}", 91 | lineterm="", 92 | ) 93 | ) 94 | 95 | if diff: 96 | diff_content = "\n".join(diff) 97 | else: 98 | diff_content = "No changes" 99 | 100 | content_hash = hashlib.sha256(new_content.encode()).hexdigest() 101 | return diff_content, content_hash 102 |
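For reference, the unified diff that `generate_edit_event_content` publishes on the event bus is standard `git`-style output, since it delegates to `difflib.unified_diff` with the same `a/` and `b/` file labels. A quick standalone illustration:

import difflib

old = "def greet():\n    print('hi')\n"
new = "def greet():\n    print('hello')\n"

diff = "\n".join(
    difflib.unified_diff(
        old.splitlines(), new.splitlines(),
        fromfile="a/greet.py", tofile="b/greet.py", lineterm="",
    )
)
print(diff)
# --- a/greet.py
# +++ b/greet.py
# @@ -1,2 +1,2 @@
#  def greet():
# -    print('hi')
# +    print('hello')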
34 | """ 35 | 36 | file_paths: list[str] = Field( 37 | ..., 38 | description="A list of one or more absolute filepaths to add to open in your context window", 39 | ) 40 | show_line_numbers: bool = Field( 41 | False, 42 | description="When True, displays line numbers in the left margin of the file for easier reference.", 43 | ) 44 | 45 | def __init__(self, calling_agent: AgentInterface, **data): 46 | super().__init__(calling_agent=calling_agent, **data) 47 | 48 | async def run(self) -> ToolResult: 49 | try: 50 | output_strings = [] 51 | warnings = [] 52 | total_lines = 0 53 | for fpath in self.file_paths: 54 | path = Path(fpath) 55 | if not path.exists(): 56 | warnings.append(f"File path: {path} does not exist") 57 | continue 58 | 59 | output_strings.append(f"The file at {path} was opened successfully.") 60 | 61 | file_content = path.read_text() 62 | event = FileEvent( 63 | type=EventType.FILE_EVENT, 64 | content=file_content, 65 | path=str(fpath), 66 | operation=FileOperation.OPEN, 67 | metadata={"show_line_numbers": self.show_line_numbers}, 68 | ) 69 | total_lines += len(file_content.splitlines()) 70 | 71 | event_bus = await EventBus.get_instance() 72 | await event_bus.publish(event, self._calling_agent._id) 73 | 74 | if total_lines > 750: 75 | warnings.append(f"You have added {total_lines} of content to the context, which is quite high. If this file is not immediately relevant to the task at hand, you should make sure to close it (and any other long files) with the close_file tool.") 76 | 77 | return ToolResult( 78 | tool_name=self.TOOL_NAME, 79 | success=True, 80 | output="\n".join(output_strings) if output_strings else None, 81 | warnings="\n".join(warnings) if warnings else None, 82 | ) 83 | except Exception as e: 84 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 85 | 86 | @classmethod 87 | def generate_examples(cls) -> list[tuple["OpenFile", ToolResult]]: 88 | from ..agents.implementations import DemoAgent 89 | 90 | return [ 91 | ( 92 | cls( 93 | calling_agent=DemoAgent(), 94 | file_paths=["/home/agent/workdir/example.txt"], 95 | show_line_numbers=False, 96 | ), 97 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 98 | ), 99 | ( 100 | cls( 101 | calling_agent=DemoAgent(), 102 | file_paths=["/tmp/example.txt", "/home/agent/workdir/new.txt"], 103 | show_line_numbers=True, 104 | ), 105 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 106 | ), 107 | ] 108 | 109 | 110 | class CloseFile(BaseTool): 111 | TOOL_NAME = "close_files" 112 | TOOL_DESCRIPTION = """Close one or more open files to clear up space in the context window. 113 | 114 | Note that you can call this tool with the empty list [] as the file_paths to close all open files. 115 | """ 116 | 117 | file_paths: list[str] = Field( 118 | ..., 119 | description="A list of one or more absolute file paths to close. 
If this is the empty list, then all files will be closed", 120 | ) 121 | 122 | def __init__(self, calling_agent: AgentInterface, **data): 123 | super().__init__(calling_agent=calling_agent, **data) 124 | 125 | async def run(self) -> ToolResult: 126 | try: 127 | event_bus = await EventBus.get_instance() 128 | 129 | if len(self.file_paths) == 0: 130 | open_files = await get_open_file_set(self._calling_agent._id) 131 | for open_event in open_files: 132 | close_event = FileEvent( 133 | type=EventType.FILE_EVENT, 134 | content="", 135 | path=open_event.path, 136 | operation=FileOperation.CLOSE, 137 | ) 138 | await event_bus.publish(close_event, self._calling_agent._id) 139 | return ToolResult( 140 | tool_name=self.TOOL_NAME, 141 | success=True, 142 | ) 143 | 144 | for fpath in self.file_paths: 145 | close_event = FileEvent( 146 | type=EventType.FILE_EVENT, 147 | content="", 148 | path=fpath, 149 | operation=FileOperation.CLOSE, 150 | ) 151 | await event_bus.publish(close_event, self._calling_agent._id) 152 | 153 | return ToolResult( 154 | tool_name=self.TOOL_NAME, 155 | success=True, 156 | ) 157 | except Exception as e: 158 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 159 | 160 | @classmethod 161 | def generate_examples(cls) -> list[tuple["CloseFile", ToolResult]]: 162 | from ..agents.implementations import DemoAgent 163 | 164 | return [ 165 | ( 166 | cls( 167 | calling_agent=DemoAgent(), 168 | file_paths=["/home/agent/workdir/example.txt"], 169 | ), 170 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 171 | ), 172 | ( 173 | cls( 174 | calling_agent=DemoAgent(), 175 | file_paths=[], 176 | ), 177 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 178 | ), 179 | ] 180 | -------------------------------------------------------------------------------- /base_agent/src/tools/reasoning_structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/tools/reasoning_structures/__init__.py -------------------------------------------------------------------------------- /base_agent/src/tools/reasoning_structures/coding.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A reasoning structure for coding. 8 | """ 9 | import logging 10 | 11 | from pydantic import PrivateAttr 12 | 13 | from .sequential import Step, ToolBasedReasoningStructure, _make_id 14 | from ...types.tool_types import ToolResult 15 | from ...types.agent_types import AgentInterface 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.INFO) 19 | 20 | 21 | class CodingReasoningStructure(ToolBasedReasoningStructure): 22 | 23 | TOOL_NAME = "coding_reasoning_structure" 24 | TOOL_DESCRIPTION = """Apply this reasoning structure when you detect that you have a non-trivial coding implementation task that requires a methodical approach involving initial exploration, implementation, verification and cleanup to complete well. 25 | 26 | Do not call this tool if merely verifying, testing or if the task at hand is quick and does not require such rigour. 
27 | 28 | This reasoning structure will guide you through good software engineering practices, and ensure that no steps have been missed out. 29 | """ 30 | _steps: list[Step] = PrivateAttr(default_factory=lambda: [ 31 | Step( 32 | identifier=_make_id(), 33 | instruction="Explore the project to a) locate all useful documentation (README.md files, common likely MD documentation files, etc), b) locate all files that may relate to your programming instructions, c) identify module-level and file-level design patterns and conventions.", 34 | done_description="You have viewed each of these files, made sure to close irrelevant or long files, and taken notes or summaries. Note that for greenfield projects, this step may complete trivially.", 35 | failed_description="Files could not be opened for some reason, or the project location is unclear.", 36 | ), 37 | Step( 38 | identifier=_make_id(), 39 | instruction="Carefully implement the solution completely and thoroughly. Make sure you observe any existing stylistic conventions, and effectively re-use existing design patterns or modules to avoid duplicating functionality.", 40 | done_description="A first pass at the code implementation is complete, with tests not yet having been run.", 41 | failed_description="You have got stuck trying to get dependencies set up, getting mocks and fixtures set up, or have otherwise digressed from the core code implementation.", 42 | ), 43 | Step( 44 | identifier=_make_id(), 45 | instruction="Test the implementation end-to-end, favouring test scripts instead of test frameworks. If this is not an option or the project already has a test framework set up, then use that.", 46 | done_description="You have ensured that the code is valid, hasn't introduced any regressions and works as intended", 47 | failed_description="You have got stuck writing TDD loops, getting dependencies set up, getting mocks and fixtures set up", 48 | ), 49 | Step( 50 | identifier=_make_id(), 51 | instruction="Clean up: remove any temporary test scripts, toy implementations or other scaffolding. Check that all documentation and docstrings are up-to-date.", 52 | done_description="All temporary files have been removed, and documentation updated.", 53 | ), 54 | ]) 55 | 56 | def __init__(self, calling_agent: AgentInterface, **data): 57 | super().__init__(calling_agent=calling_agent, **data) 58 | 59 | @classmethod 60 | def generate_examples(cls) -> list[tuple["CodingReasoningStructure", ToolResult]]: 61 | from ...agents.implementations import DemoAgent 62 | 63 | return [ 64 | ( 65 | cls(calling_agent=DemoAgent()), 66 | ToolResult( 67 | tool_name=cls.TOOL_NAME, 68 | success=True, 69 | output="The first step in the coding reasoning structure is: ...", 70 | ), 71 | ), 72 | ] 73 |
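Defining a new domain-specific structure in this style only requires a `TOOL_NAME`, a `TOOL_DESCRIPTION`, and a `_steps` list. A hypothetical sketch for a debugging workflow, following the same pattern (this class does not exist in the repository):

class DebuggingReasoningStructure(ToolBasedReasoningStructure):

    TOOL_NAME = "debugging_reasoning_structure"  # hypothetical example
    TOOL_DESCRIPTION = """Apply this reasoning structure when methodically isolating and fixing a non-trivial bug."""

    _steps: list[Step] = PrivateAttr(default_factory=lambda: [
        Step(
            identifier=_make_id(),
            instruction="Reproduce the failure with a minimal, deterministic test case.",
            done_description="The failure reproduces reliably from a single command.",
            failed_description="The failure is intermittent or cannot be triggered.",
        ),
        Step(
            identifier=_make_id(),
            instruction="Bisect to the root cause: narrow down the failing commit, module or input.",
            done_description="The root cause has been identified and written down.",
        ),
        Step(
            identifier=_make_id(),
            instruction="Fix the root cause and re-run the reproduction to confirm, checking for regressions.",
            done_description="The minimal test case passes and no regressions appear.",
        ),
    ])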
9 | """ 10 | import logging 11 | 12 | from uuid import uuid4 13 | 14 | from ..base_tool import BaseTool 15 | from ...types.tool_types import ToolResult 16 | from ...types.agent_types import AgentInterface 17 | from ...types.llm_types import FCI, ToolCallContent 18 | from ...agents.implementations.coder import CodingAgent 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.INFO) 22 | 23 | 24 | class SubagentBasedReasoningStructure(BaseTool): 25 | 26 | TOOL_NAME = "example_subagent_reasoning_structure" 27 | TOOL_DESCRIPTION = """Reason through a fixed list of points sequentially.""" 28 | 29 | def __init__(self, calling_agent: AgentInterface, **data): 30 | super().__init__(calling_agent=calling_agent, **data) 31 | 32 | async def run(self) -> ToolResult: 33 | parent_agent: AgentInterface = self._calling_agent 34 | 35 | try: 36 | await parent_agent._handle_agent_call(ToolCallContent( 37 | call_id=f"agent_{uuid4().hex[:8]}", 38 | tool_name=CodingAgent.AGENT_NAME, 39 | tool_args=dict( 40 | programming_instructions="Print 'a' in a file called 'a.txt'", 41 | ), 42 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 43 | )) 44 | 45 | await parent_agent._handle_agent_call(ToolCallContent( 46 | call_id=f"agent_{uuid4().hex[:8]}", 47 | tool_name=CodingAgent.AGENT_NAME, 48 | tool_args=dict( 49 | programming_instructions="Print 'b' in a file called 'b.txt'", 50 | ), 51 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 52 | )) 53 | 54 | await parent_agent._handle_agent_call(ToolCallContent( 55 | call_id=f"agent_{uuid4().hex[:8]}", 56 | tool_name=CodingAgent.AGENT_NAME, 57 | tool_args=dict( 58 | programming_instructions="Print 'c' in a file called 'c.txt'", 59 | ), 60 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 61 | )) 62 | 63 | return ToolResult( 64 | tool_name=self.TOOL_NAME, 65 | success=True, 66 | output="Completed successfully" 67 | ) 68 | 69 | except Exception as e: 70 | return ToolResult( 71 | tool_name=self.TOOL_NAME, 72 | success=False, 73 | errors=f"Error in sequential reasoning: {e}" 74 | ) 75 | 76 | @classmethod 77 | def generate_examples(cls) -> list[tuple["SubagentBasedReasoningStructure", ToolResult]]: 78 | from ...agents.implementations import DemoAgent 79 | 80 | return [ 81 | ( 82 | cls(calling_agent=DemoAgent()), 83 | ToolResult( 84 | tool_name=cls.TOOL_NAME, 85 | success=True, 86 | output="Successfully did the ABC", 87 | ), 88 | ), 89 | ] 90 | -------------------------------------------------------------------------------- /base_agent/src/types/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/types/__init__.py -------------------------------------------------------------------------------- /base_agent/src/types/common.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | from enum import Enum 7 | 8 | 9 | class ArgFormat(str, Enum): 10 | """Tool argument formats""" 11 | 12 | XML = "xml" 13 | JSON = "json" 14 | -------------------------------------------------------------------------------- /base_agent/src/types/event_types.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import hashlib 7 | 8 | from enum import Enum 9 | from pathlib import Path 10 | from datetime import datetime 11 | from dataclasses import field, dataclass 12 | 13 | 14 | class EventType(Enum): 15 | ASSISTANT_MESSAGE = "assistant_message" 16 | ASSISTANT_REASONING = "assistant_reasoning" 17 | TOOL_CALL = "tool_call" 18 | TOOL_RESULT = "tool_result" 19 | AGENT_CALL = "agent_call" 20 | AGENT_RESULT = "agent_result" 21 | CORE_PROMPT_UPDATE = "core_prompt_update" 22 | SYSTEM_PROMPT_UPDATE = "system_prompt_update" 23 | FILE_EVENT = "file_event" 24 | APPLICATION_ERROR = "application_error" 25 | APPLICATION_WARNING = "application_warning" 26 | PROBLEM_STATEMENT = "problem_statement" # initial problem statement 27 | EXTERNAL_MESSAGE = "external_message" # subsequent update messages 28 | OVERSEER_NOTIFICATION = "overseer_notification" 29 | OVERSEER_UPDATE = "overseer_update" # for debugging 30 | BUDGET_INFO = "budget_info" 31 | TIMEOUT = "timeout" 32 | COST_LIMIT = "cost_limit" 33 | 34 | 35 | @dataclass 36 | class Event: 37 | """Base class for all events in the stream""" 38 | 39 | type: EventType 40 | content: str 41 | metadata: dict = field(default_factory=dict) 42 | timestamp: datetime = field(default_factory=datetime.now) 43 | 44 | 45 | class FileOperation(Enum): 46 | OPEN = "open" 47 | CLOSE = "close" 48 | EDIT = "edit" 49 | 50 | 51 | @dataclass 52 | class FileEvent: 53 | """Special event for file operations""" 54 | 55 | type: EventType 56 | content: str # NOTE: this is the formatted content, not just the raw file content (e.g. 
with line numbers, content hash, lsp diagnostics, etc) 57 | operation: FileOperation 58 | path: str 59 | 60 | timestamp: datetime = field(default_factory=datetime.now) 61 | metadata: dict = field(default_factory=dict) # NOTE: unused 62 | 63 | mtime: float = field(default=0.0) 64 | content_hash: str = field(default="") 65 | diff: str | None = None 66 | # lsp_diagnostics: list = field(default_factory=list) 67 | 68 | def __post_init__(self): 69 | """Compute hash on initialization if not provided""" 70 | if not self.content_hash and self.content: 71 | self.content_hash = hashlib.sha256(self.content.encode()).hexdigest() 72 | 73 | if self.mtime == 0.0: 74 | try: 75 | self.mtime = Path(self.path).stat().st_mtime 76 | except Exception: 77 | pass 78 | 79 | def __hash__(self): 80 | return hash((self.type, self.operation, self.path, self.content_hash)) 81 | 82 | def __eq__(self, other): 83 | if not isinstance(other, FileEvent): 84 | return False 85 | return ( 86 | self.type == other.type 87 | and self.operation == other.operation 88 | and self.path == other.path 89 | and self.content_hash == other.content_hash 90 | ) 91 | -------------------------------------------------------------------------------- /base_agent/src/types/tool_types.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import os 7 | 8 | from abc import ABC, abstractmethod 9 | from typing import Any, ClassVar 10 | from pydantic import BaseModel, Field 11 | from ..schemas import dumps 12 | from ..types.common import ArgFormat 13 | 14 | 15 | class ToolResult(BaseModel): 16 | """Represents the result of a tool execution.""" 17 | 18 | tool_name: str 19 | success: bool 20 | duration: float = 0.0 # on tool error paths, duration is often 0 21 | output: dict[str, Any] | str | None = None 22 | warnings: str | None = None 23 | errors: str | None = None 24 | invocation_id: str = Field(default_factory=lambda: os.urandom(4).hex()) 25 | 26 | def __str__(self): 27 | str_output = self.output if isinstance(self.output, str) else None 28 | if isinstance(self.output, dict): 29 | str_output = dumps(self.output, ArgFormat.XML, indent=2) 30 | 31 | tool_response_str = "" 32 | tool_response_str += ( 33 | f"\n{'SUCCESS' if self.success else 'FAILURE'}" 34 | ) 35 | if str_output is not None: 36 | tool_response_str += f"\n{str_output}" 37 | if self.warnings is not None: 38 | tool_response_str += f"\n{self.warnings}" 39 | if self.errors is not None: 40 | tool_response_str += f"\n{self.errors}" 41 | if self.duration is not None: 42 | tool_response_str += f"\n{self.duration:.3f}" 43 | tool_response_str += "\n" 44 | 45 | return tool_response_str 46 | 47 | def to_plain_string(self): 48 | str_output = self.output if isinstance(self.output, str) else None 49 | if isinstance(self.output, dict): 50 | str_output = dumps(self.output, ArgFormat.JSON, indent=2) 51 | 52 | tool_response_str = f"{self.tool_name} response:" 53 | tool_response_str += f"\nSuccess: {self.success}" 54 | if str_output is not None: 55 | tool_response_str += f"\nResult: {str_output}" 56 | if self.warnings is not None: 57 | tool_response_str += f"\nWarnings: {self.warnings}" 58 | if self.errors is not None: 59 | tool_response_str += f"\nErrors: {self.errors}" 60 | if self.duration is not None: 61 | tool_response_str += f"\nDuration: {self.duration:.3f}" 62 |
63 | return tool_response_str 64 | 65 | 66 | class ToolInterface(BaseModel, ABC): 67 | """Abstract interface for all tools""" 68 | 69 | # Class variables 70 | TOOL_NAME: ClassVar[str] 71 | TOOL_DESCRIPTION: ClassVar[str] 72 | EPHEMERAL: ClassVar[bool] = False 73 | 74 | class Config: 75 | extra = "forbid" 76 | 77 | @abstractmethod 78 | async def run(self) -> ToolResult: 79 | """Execute the tool's functionality""" 80 | pass 81 | 82 | @classmethod 83 | @abstractmethod 84 | def generate_examples(cls) -> list[tuple["BaseTool", ToolResult]]: 85 | """Generate example uses of the tool with their expected outputs""" 86 | pass 87 | 88 | @classmethod 89 | @abstractmethod 90 | def to_prompt_format(cls, arg_format: ArgFormat = ArgFormat.XML) -> str: 91 | """Convert the tool definition to XML format for the unconstrained tool use prompt.""" 92 | pass 93 | 94 | @classmethod 95 | @abstractmethod 96 | def to_plain_prompt_format(cls, arg_format: ArgFormat = ArgFormat.JSON) -> str: 97 | """Convert the tool definition to a formatted string for the constrained tool use prompt. 98 | 99 | NOTE: most providers use JSON-like syntax in their prompts, so 100 | generating few-shot examples like this tends to work better. 101 | """ 102 | pass 103 | -------------------------------------------------------------------------------- /base_agent/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/utils/__init__.py -------------------------------------------------------------------------------- /base_agent/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import random 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from ..types.llm_types import TokenUsage 11 | from ..types.agent_types import AgentMetrics 12 | 13 | 14 | def make_random_agent_metrics( 15 | tools_enabled: bool = True, 16 | agents_enabled: bool = True, 17 | min_duration_seconds: int = 1, 18 | max_duration_seconds: int = 300, 19 | base_prompt_tokens: int = 500, 20 | token_variance: float = 0.3, 21 | cache_hit_rate: float = 0.4, 22 | cache_write_rate: float = 0.3, 23 | cost_per_1k_tokens: float = 0.002, 24 | seed: int = 42 # Added seed parameter 25 | ) -> AgentMetrics: 26 | """ 27 | Generate random but plausible agent metrics deterministically. 
28 | 29 | Args: 30 | tools_enabled: Whether tools are enabled for this agent 31 | agents_enabled: Whether sub-agents are enabled for this agent 32 | min_duration_seconds: Minimum execution duration in seconds 33 | max_duration_seconds: Maximum execution duration in seconds 34 | base_prompt_tokens: Base number of prompt tokens to vary around 35 | token_variance: How much to vary token counts (as proportion of base) 36 | cache_hit_rate: Proportion of tokens that should be cached hits 37 | cache_write_rate: Proportion of uncached tokens that should be written to cache 38 | cost_per_1k_tokens: Cost per 1000 tokens in dollars 39 | seed: Random seed for deterministic output 40 | 41 | Returns: 42 | AgentMetrics object with randomized but plausible values 43 | """ 44 | # Set the random seed for reproducibility 45 | random.seed(seed) 46 | 47 | # Use a fixed base time instead of datetime.now() 48 | base_time = datetime(2025, 1, 1, 0, 0, 0) # Fixed starting point 49 | start_time = base_time - timedelta(days=random.randint(0, 7)) 50 | duration = random.uniform(min_duration_seconds, max_duration_seconds) 51 | end_time = start_time + timedelta(seconds=duration) 52 | 53 | # Calculate base token counts with some variance 54 | variance_factor = 1 + random.uniform(-token_variance, token_variance) 55 | total_prompt_tokens = int(base_prompt_tokens * variance_factor) 56 | 57 | # Calculate cached vs uncached split 58 | cached_tokens = int(total_prompt_tokens * cache_hit_rate) 59 | uncached_tokens = total_prompt_tokens - cached_tokens 60 | 61 | # Calculate cache writes 62 | cache_writes = int(uncached_tokens * cache_write_rate) 63 | 64 | # Generate completion tokens (typically 20-80% of prompt tokens) 65 | completion_tokens = int(total_prompt_tokens * random.uniform(0.2, 0.8)) 66 | 67 | # Calculate tool and agent calls if enabled 68 | tool_calls = 0 69 | agent_calls = 0 70 | 71 | if tools_enabled: 72 | # Typically 1-5 tool calls per interaction 73 | tool_calls = random.randint(1, 5) 74 | 75 | if agents_enabled: 76 | # Typically 0-3 agent calls per interaction 77 | agent_calls = random.randint(0, 3) 78 | 79 | # Calculate total cost 80 | total_tokens = total_prompt_tokens + completion_tokens 81 | cost = (total_tokens / 1000) * cost_per_1k_tokens 82 | 83 | return AgentMetrics( 84 | start_time=start_time, 85 | end_time=end_time, 86 | token_usage=TokenUsage( 87 | uncached_prompt_tokens=uncached_tokens - cache_writes, 88 | cache_write_prompt_tokens=cache_writes, 89 | cached_prompt_tokens=cached_tokens, 90 | completion_tokens=completion_tokens, 91 | ), 92 | cost=cost, 93 | tool_calls=tool_calls, 94 | agent_calls=agent_calls, 95 | ) 96 | -------------------------------------------------------------------------------- /base_agent/src/utils/parsing.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Some parsing utilities. 8 | 9 | This module provides utilities for parsing various types of data, 10 | particularly focusing on numerical parsing from strings. 
11 | """ 12 | 13 | import re 14 | 15 | from typing import Optional, Literal 16 | 17 | def extract_before_last(text: str, pattern: str, keep_pattern: bool = False) -> str: 18 | last_pos = text.rfind(pattern) 19 | offset = len(pattern) if keep_pattern else 0 20 | return text[:last_pos + offset] if last_pos != -1 else "" 21 | 22 | def extract_after_last(text: str, pattern: str, keep_pattern: bool = False) -> str: 23 | last_pos = text.rfind(pattern) 24 | offset = 0 if keep_pattern else len(pattern) 25 | return text[last_pos + offset:] if last_pos != -1 else "" 26 | 27 | 28 | def extract_after_first(text: str, pattern: str, keep_pattern: bool = False) -> str: 29 | first_pos = text.find(pattern) 30 | offset = 0 if keep_pattern else len(pattern) 31 | return text[first_pos + offset:] if first_pos != -1 else "" 32 | 33 | 34 | def extract_between_patterns( 35 | s: str, 36 | pattern_a: str, 37 | pattern_b: str, 38 | a_occurrence: Literal["first"] | Literal["last"] = "first", 39 | b_occurrence: Literal["first"] | Literal["last"] = "last", 40 | ) -> str | None: 41 | # Validate both occurrences upfront 42 | if a_occurrence not in ("first", "last"): 43 | raise ValueError("Invalid value for a_occurrence. Use 'first' or 'last'.") 44 | if b_occurrence not in ("first", "last"): 45 | raise ValueError("Invalid value for b_occurrence. Use 'first' or 'last'.") 46 | 47 | # Determine the index for `pattern_a` 48 | if a_occurrence == "first": 49 | start_index = s.find(pattern_a) 50 | else: # "last" 51 | start_index = s.rfind(pattern_a) 52 | 53 | if start_index == -1: 54 | return None 55 | 56 | start_index += len(pattern_a) 57 | 58 | # Determine the index for `pattern_b` 59 | if b_occurrence == "first": 60 | end_index = s.find(pattern_b) 61 | else: # "last" 62 | end_index = s.rfind(pattern_b) 63 | 64 | if end_index == -1 or end_index <= start_index: 65 | return None 66 | 67 | return s[start_index:end_index] 68 | 69 | 70 | def parse_number_from_string( 71 | answer: str, 72 | ) -> tuple[bool, Optional[float], Optional[str]]: 73 | cleaned = answer.strip().replace(",", "") 74 | 75 | # Pattern for a valid number segment 76 | number_pattern = r"-?\d*\.?\d+(?:[eE][-+]?\d+)?" 77 | match = re.search(number_pattern, cleaned) 78 | 79 | if not match: 80 | return ( 81 | False, 82 | None, 83 | "Could not find a number in the answer. Please provide a clear numerical response.", 84 | ) 85 | 86 | matched_str = match.group() 87 | # Check for multiple decimal points in the matched string 88 | if matched_str.count(".") > 1: 89 | return ( 90 | False, 91 | None, 92 | "Found what looks like a number but couldn't parse it: too many decimal points", 93 | ) 94 | 95 | try: 96 | value = float(matched_str) 97 | full_match = matched_str == cleaned 98 | if not full_match: 99 | return ( 100 | True, 101 | value, 102 | "Warning: Found additional text around the number. In future, try to provide just the number.", 103 | ) 104 | return True, value, None 105 | except ValueError as e: 106 | return ( 107 | False, 108 | None, 109 | f"Found what looks like a number but couldn't parse it: {str(e)}", 110 | ) 111 | -------------------------------------------------------------------------------- /base_agent/src/utils/stop_tokens.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
-------------------------------------------------------------------------------- /base_agent/src/utils/stop_tokens.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # WARNING: while you can read this file, editing this file directly 7 | # will stop your generation abruptly and you will fail! 8 | # 9 | # If you want to add a new stop token for the next agent iteration, then you 10 | # should append it to this file using a terminal tool like: 11 | # echo 'NEW_STOP_TOKEN = ""' >> tools/stop_tokens.py 12 | # 13 | # If you want to remove one, then make a line edit using something like: 14 | # sed -i '<line>d' tools/stop_tokens.py. 15 | # Note that the first token, TOOL_STOP_TOKEN, is on line 14 of this file after 16 | # this comment is counted. To delete it, you'd do: 17 | # sed -i '14d' tools/stop_tokens.py. 18 | 19 | TOOL_STOP_TOKEN = "" 20 | AGENT_STOP_TOKEN = "" 21 | OVERSEER_STOP_TOKEN = "" 22 | -------------------------------------------------------------------------------- /base_agent/src/web_server/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Web server package for callgraph visualization.""" 7 | 8 | from .server import run_server 9 | 10 | __all__ = ["run_server"] 11 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/components/metrics-display.js: -------------------------------------------------------------------------------- 1 | /** 2 | * MetricsDisplay component 3 | */ 4 | 5 | import { Component } from "../core.js"; 6 | import { formatters } from "../utils/formatters.js"; 7 | import { store } from "../store.js"; 8 | 9 | export class MetricsDisplay extends Component { 10 | constructor() { 11 | super(); 12 | this.attachShadow({ mode: "open" }); 13 | 14 | // Add styles 15 | const style = document.createElement("style"); 16 | style.textContent = ` 17 | :host { 18 | display: flex; 19 | flex-wrap: wrap; 20 | align-items: center; 21 | color: white; 22 | } 23 | .metric { 24 | display: flex; 25 | align-items: center; 26 | margin-right: 1.5rem; 27 | } 28 | .label { 29 | font-size: 0.75rem; 30 | text-transform: uppercase; 31 | letter-spacing: 0.05em; 32 | color: #d1d5db; 33 | } 34 | .value { 35 | margin-left: 0.5rem; 36 | font-size: 0.875rem; 37 | font-weight: 500; 38 | } 39 | .cached { 40 | font-size: 0.75rem; 41 | color: #d1d5db; 42 | } 43 | `; 44 | this.shadowRoot.appendChild(style); 45 | 46 | // Create container 47 | this.container = document.createElement("div"); 48 | this.container.style.display = "flex"; 49 | this.container.style.flexWrap = "wrap"; 50 | this.container.style.alignItems = "center"; 51 | this.shadowRoot.appendChild(this.container); 52 | 53 | // Listen for state changes 54 | document.addEventListener("state-change", (e) => { 55 | if (e.detail.property === "callgraphData") { 56 | this.setState({ data: e.detail.value }); 57 | } 58 | }); 59 | } 60 | 61 | render() { 62 | const data = this.state.data || {}; 63 | 64 | this.container.innerHTML = ` 65 | <div class="metric">
66 | <span class="label">Duration</span> 67 | <span class="value">${formatters.duration(data.total_duration)}</span> 68 | </div> 69 | <div class="metric"> 70 | <span class="label">Total Tokens</span> 71 | <span class="value">${formatters.tokens(data.total_tokens)}</span> 72 | <span class="cached">${data.total_tokens ? formatters.cachePercent(data.num_cached_tokens, data.total_tokens) : "-"}</span> 73 | </div> 74 | <div class="metric"> 75 | <span class="label">Cost</span> 76 | <span class="value">${formatters.cost(data.total_cost)}</span> 77 | </div>
78 | `; 79 | } 80 | } 81 | 82 | customElements.define("metrics-display", MetricsDisplay); 83 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/core.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Core reactive system for the visualization 3 | */ 4 | 5 | // Base Component class 6 | export class Component extends HTMLElement { 7 | constructor() { 8 | super(); 9 | this.state = new Proxy( 10 | {}, 11 | { 12 | set: (target, property, value) => { 13 | target[property] = value; 14 | this.render(); 15 | return true; 16 | }, 17 | }, 18 | ); 19 | } 20 | 21 | setState(newState) { 22 | Object.assign(this.state, newState); 23 | } 24 | 25 | render() { 26 | // Override in subclasses 27 | } 28 | 29 | connectedCallback() { 30 | this.render(); 31 | } 32 | } 33 | 34 | // HTML escaping utility 35 | export function escapeHtml(unsafe) { 36 | return unsafe 37 | .replace(/&/g, "&amp;") 38 | .replace(/</g, "&lt;") 39 | .replace(/>/g, "&gt;") 40 | .replace(/"/g, "&quot;") 41 | .replace(/'/g, "&#039;"); 42 | } 43 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/store.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Centralized state management with WebSocket support 3 | */ 4 | 5 | export const store = new Proxy( 6 | { 7 | callgraphData: null, 8 | }, 9 | { 10 | set(target, property, value) { 11 | target[property] = value; 12 | document.dispatchEvent( 13 | new CustomEvent("state-change", { 14 | detail: { property, value }, 15 | }) 16 | ); 17 | return true; 18 | }, 19 | } 20 | ); 21 | 22 | let socket; 23 | 24 | export async function updateVisualization() { 25 | try { 26 | const response = await fetch("/api/callgraph"); 27 | const data = await response.json(); 28 | 29 | // Skip if data hasn't changed 30 | if (JSON.stringify(data) !== JSON.stringify(store.callgraphData)) { 31 | store.callgraphData = data; 32 | } 33 | } catch (error) { 34 | console.error("Error updating visualization:", error); 35 | } 36 | } 37 | 38 | function connectWebSocket() { 39 | socket = new WebSocket(`ws://${window.location.host}/ws`); 40 | 41 | socket.onopen = () => { 42 | console.log("WebSocket connected"); 43 | }; 44 | 45 | socket.onmessage = (event) => { 46 | const message = JSON.parse(event.data); 47 | if (message.type === 'event') { 48 | // Get latest data to incorporate the new event 49 | updateVisualization(); 50 | } 51 | }; 52 | 53 | socket.onclose = () => { 54 | console.log("WebSocket disconnected.
Reconnecting..."); 55 | setTimeout(connectWebSocket, 1000); 56 | }; 57 | 58 | socket.onerror = (error) => { 59 | console.error("WebSocket error:", error); 60 | }; 61 | } 62 | 63 | // Start WebSocket connection and initial data load 64 | export function startUpdates() { 65 | updateVisualization(); // Initial load 66 | connectWebSocket(); // Real-time updates 67 | } -------------------------------------------------------------------------------- /base_agent/src/web_server/static/styles.css: -------------------------------------------------------------------------------- 1 | /* Base styles */ 2 | body { 3 | font-family: "Inter", sans-serif; 4 | color: #1f2937; 5 | } 6 | 7 | /* Global styles */ 8 | .execution-tree, 9 | #event-stream { 10 | font-family: "JetBrains Mono", monospace; 11 | font-size: 13px; 12 | line-height: 1.3; 13 | } 14 | 15 | /* Header styles */ 16 | .header { 17 | background: linear-gradient(90deg, #1e293b 0%, #334155 100%); 18 | } 19 | 20 | /* Tree visualization styles */ 21 | .execution-tree { 22 | position: relative; 23 | } 24 | 25 | .execution-tree .node { 26 | margin-bottom: 0.25rem; 27 | position: relative; 28 | } 29 | 30 | .execution-tree .node-content { 31 | margin-left: 1.25rem; 32 | position: relative; 33 | } 34 | 35 | /* Vertical line for tree structure */ 36 | .execution-tree .node-content::before { 37 | content: ""; 38 | position: absolute; 39 | left: -12px; 40 | top: 0; 41 | bottom: 0; 42 | width: 2px; 43 | background-color: #e2e8f0; 44 | } 45 | 46 | /* Hover effect for collapsible areas */ 47 | .execution-tree .node-content:hover::before { 48 | background-color: #93c5fd; 49 | } 50 | 51 | /* Reduce vertical space */ 52 | .execution-tree .event-entry, 53 | #event-stream .event { 54 | padding-top: 0.125rem; 55 | padding-bottom: 0.125rem; 56 | } 57 | 58 | /* Event line styles */ 59 | .event-line { 60 | position: relative; 61 | } 62 | 63 | /* Event stream styles */ 64 | #event-stream .event { 65 | margin-bottom: 1rem; 66 | border-radius: 0.25rem; 67 | overflow: hidden; 68 | box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1); 69 | } 70 | 71 | #event-stream .event-content, 72 | #event-stream .event-full-content { 73 | background-color: #f8fafc; 74 | transition: background-color 0.2s; 75 | } 76 | 77 | #event-stream .event-content:hover, 78 | #event-stream .event-full-content:hover { 79 | background-color: #f1f5f9; 80 | } 81 | 82 | /* Execution tree hover styles */ 83 | .execution-tree .cursor-pointer { 84 | transition: background-color 0.2s; 85 | } 86 | 87 | .execution-tree .cursor-pointer:hover { 88 | background-color: #eff6ff; 89 | } 90 | 91 | /* Status indicators */ 92 | .status-indicator { 93 | display: inline-block; 94 | width: 10px; 95 | height: 10px; 96 | border-radius: 50%; 97 | margin-right: 0.5rem; 98 | } 99 | 100 | .status-pending { 101 | background-color: #fbbf24; 102 | } 103 | 104 | .status-running { 105 | background-color: #60a5fa; 106 | animation: pulse 2s infinite; 107 | } 108 | 109 | .status-success { 110 | background-color: #34d399; 111 | } 112 | 113 | .status-failed { 114 | background-color: #f87171; 115 | } 116 | 117 | @keyframes pulse { 118 | 0% { 119 | opacity: 1; 120 | } 121 | 50% { 122 | opacity: 0.6; 123 | } 124 | 100% { 125 | opacity: 1; 126 | } 127 | } 128 | 129 | /* Animation */ 130 | @keyframes highlight { 131 | 0% { 132 | background-color: rgba(59, 130, 246, 0.1); 133 | } 134 | 50% { 135 | background-color: rgba(59, 130, 246, 0.1); 136 | } 137 | 100% { 138 | background-color: transparent; 139 | } 140 | } 141 | 142 | .event-highlight { 143 | 
animation: highlight 2s ease-in-out; 144 | } 145 | 146 | /* Scrollbar styles */ 147 | ::-webkit-scrollbar { 148 | width: 8px; 149 | height: 8px; 150 | } 151 | 152 | ::-webkit-scrollbar-track { 153 | background: #f1f5f9; 154 | border-radius: 4px; 155 | } 156 | 157 | ::-webkit-scrollbar-thumb { 158 | background: #cbd5e1; 159 | border-radius: 4px; 160 | } 161 | 162 | ::-webkit-scrollbar-thumb:hover { 163 | background: #94a3b8; 164 | } 165 | 166 | /* Utility classes */ 167 | .truncate { 168 | white-space: nowrap; 169 | overflow: hidden; 170 | text-overflow: ellipsis; 171 | } 172 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/utils/event-utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Event-related utilities 3 | */ 4 | 5 | // Event type mapping for badges 6 | export function getEventBadge(type) { 7 | const badges = { 8 | assistant_message: "assistant", 9 | tool_call: "tool", 10 | tool_result: "tool", 11 | agent_call: "agent", 12 | agent_result: "agent", 13 | overseer_notification: "overseer", 14 | system_prompt_update: "system", 15 | core_prompt_update: "system", 16 | default: "system", 17 | }; 18 | return badges[type] || badges.default; 19 | } 20 | 21 | // Get node status indicator 22 | export function getStatusIndicator(node) { 23 | if (!node.started_at) { 24 | return { class: "status-pending", label: "Pending" }; 25 | } 26 | if (!node.completed_at) { 27 | return { class: "status-running", label: "Running" }; 28 | } 29 | return node.success 30 | ? { class: "status-success", label: "Success" } 31 | : { class: "status-failed", label: "Failed" }; 32 | } 33 | 34 | // Creates a chronological event stream from all events across all nodes 35 | export function createChronologicalEventStream(nodes) { 36 | const allEvents = []; 37 | Object.entries(nodes).forEach(([nodeId, node]) => { 38 | if (node.events) { 39 | allEvents.push( 40 | ...node.events.map((event) => ({ 41 | nodeId, 42 | nodeName: node.name, 43 | event, 44 | time: new Date(event.timestamp), 45 | })), 46 | ); 47 | } 48 | }); 49 | return allEvents.sort((a, b) => a.time - b.time); 50 | } 51 | 52 | // Sort events while maintaining agent call sequence 53 | export function sortNodeEvents(events) { 54 | const sortedEvents = []; 55 | const tempEvents = [...events].sort( 56 | (a, b) => new Date(a.timestamp) - new Date(b.timestamp), 57 | ); 58 | 59 | let i = 0; 60 | while (i < tempEvents.length) { 61 | const event = tempEvents[i]; 62 | sortedEvents.push(event); 63 | i++; 64 | 65 | if (event.type === "agent_call") { 66 | const callTime = new Date(event.timestamp); 67 | const agentEvents = []; 68 | let j = i; 69 | let foundResult = false; 70 | while (j < tempEvents.length && !foundResult) { 71 | const nextEvent = tempEvents[j]; 72 | if ( 73 | nextEvent.type === "agent_result" && 74 | new Date(nextEvent.timestamp) > callTime 75 | ) { 76 | agentEvents.push(nextEvent); 77 | tempEvents.splice(j, 1); 78 | foundResult = true; 79 | continue; 80 | } 81 | tempEvents.splice(j, 1); 82 | agentEvents.push(nextEvent); 83 | } 84 | sortedEvents.push(...agentEvents); 85 | } 86 | } 87 | 88 | return sortedEvents; 89 | } 90 | 91 | // UI interaction functions 92 | export function toggleContent(index) { 93 | // Get event-stream component 94 | const eventStream = document.querySelector("event-stream"); 95 | if (eventStream && eventStream.shadowRoot) { 96 | const truncated = eventStream.shadowRoot.querySelector( 97 | `#event-${index} .event-content`, 
98 | ); 99 | const full = eventStream.shadowRoot.querySelector(`#event-full-${index}`); 100 | if (truncated && full) { 101 | if (truncated.classList.contains("hidden")) { 102 | truncated.classList.remove("hidden"); 103 | full.classList.add("hidden"); 104 | } else { 105 | truncated.classList.add("hidden"); 106 | full.classList.remove("hidden"); 107 | } 108 | } 109 | } 110 | } 111 | 112 | export function scrollToTop() { 113 | window.scrollTo({ top: 0, behavior: "smooth" }); 114 | } 115 | 116 | export function scrollToStreamEvent(index) { 117 | // Get event-stream component 118 | const eventStream = document.querySelector("event-stream"); 119 | if (eventStream && eventStream.shadowRoot) { 120 | const streamEvent = eventStream.shadowRoot.querySelector(`#event-${index}`); 121 | if (streamEvent) { 122 | streamEvent.scrollIntoView({ behavior: "smooth", block: "center" }); 123 | streamEvent.classList.add("event-highlight"); 124 | setTimeout(() => streamEvent.classList.remove("event-highlight"), 2000); 125 | 126 | // Expand the event details if needed 127 | const truncated = streamEvent.querySelector(`.event-content`); 128 | const full = streamEvent.querySelector(`#event-full-${index}`); 129 | if (truncated && full && truncated.classList.contains("hidden")) { 130 | toggleContent(index); 131 | } 132 | } 133 | } 134 | } 135 | 136 | export function toggleNode(nodeId) { 137 | const content = document.querySelector(`#${nodeId}-content`); 138 | if (content) { 139 | content.classList.toggle("hidden"); 140 | } 141 | } 142 | 143 | // Expose required functions to window object for global access 144 | window.scrollToStreamEvent = scrollToStreamEvent; 145 | window.scrollToTop = scrollToTop; 146 | window.toggleContent = toggleContent; 147 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/utils/formatters.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Formatting utilities 3 | */ 4 | 5 | export const formatters = { 6 | duration: (s) => 7 | !s 8 | ? "0s" 9 | : s < 60 10 | ? `${s.toFixed(1)}s` 11 | : `${Math.floor(s / 60)}m ${(s % 60).toFixed(1)}s`, 12 | tokens: (t) => 13 | !t 14 | ? "0" 15 | : t < 1000 16 | ? `${t}` 17 | : t < 1000000 18 | ? `${(t / 1000).toFixed(1)}K` 19 | : `${(t / 1000000).toFixed(1)}M`, 20 | cost: (c) => 21 | !c 22 | ? "$0.00" 23 | : c < 0.01 24 | ? `$${c.toFixed(5)}` 25 | : c < 0.1 26 | ? `$${c.toFixed(4)}` 27 | : c < 1 28 | ? `$${c.toFixed(3)}` 29 | : `$${c.toFixed(2)}`, 30 | cachePercent: (cached, total) => 31 | !total ? 
"0%" : `${((cached / total) * 100).toFixed(1)}% cached`, 32 | }; 33 | 34 | // Get total tokens from usage object 35 | export function getTotalTokens(usage) { 36 | if (!usage) return 0; 37 | return ( 38 | (usage.uncached_prompt_tokens || 0) + 39 | (usage.cache_write_prompt_tokens || 0) + 40 | (usage.cached_prompt_tokens || 0) + 41 | (usage.completion_tokens || 0) 42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/visualizer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Main visualization entry point 3 | */ 4 | 5 | import { startUpdates } from "./store.js"; 6 | import "./components/execution-tree.js"; 7 | import "./components/event-stream.js"; 8 | import "./components/metrics-display.js"; 9 | import { 10 | toggleContent, 11 | toggleNode, 12 | scrollToTop, 13 | scrollToStreamEvent, 14 | } from "./utils/event-utils.js"; 15 | 16 | // Make UI functions globally available 17 | window.toggleContent = toggleContent; 18 | window.toggleNode = toggleNode; 19 | window.scrollToTop = scrollToTop; 20 | window.scrollToStreamEvent = scrollToStreamEvent; 21 | 22 | // Start updates 23 | startUpdates(); 24 | -------------------------------------------------------------------------------- /base_agent/src/web_server/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Agent Execution 5 | 6 | 10 | 11 | 12 | 13 | 14 |
15 | 16 |
17 |
18 |
21 |

Agent Execution

22 | 23 |
24 |
25 |
26 | 27 | 28 |
29 |
30 | 31 |
32 |
33 |

Execution Tree

34 |
35 | 36 |
37 | 38 | 39 |
40 |
41 |

Event Stream

42 |
43 | 44 |
45 |
46 |
47 |
48 | 49 | 50 | -------------------------------------------------------------------------------- /base_agent/tests/benchmarks/test_gsm8k_benchmark.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Tests for the GSM8K benchmark implementation.""" 7 | import pytest 8 | import tempfile 9 | import os 10 | from pathlib import Path 11 | from unittest.mock import patch, MagicMock 12 | 13 | from src.benchmarks.gsm8k import GSM8KBenchmark, GSM8KExample 14 | 15 | 16 | class TestGSM8KExample: 17 | """Tests for the GSM8KExample class.""" 18 | 19 | def test_from_raw(self): 20 | """Test conversion from raw dataset example.""" 21 | # Create a mock raw example 22 | raw_example = { 23 | "question": "John has 5 apples. He buys 2 more. How many does he have now?", 24 | "answer": "John has 5 apples initially.\nHe buys 2 more apples.\nSo he has 5 + 2 = <<5+2=7>> apples in total.\n#### 7" 25 | } 26 | 27 | example = GSM8KExample.from_raw(raw_example) 28 | 29 | assert example.answer == raw_example["answer"] 30 | assert example.steps == [ 31 | "John has 5 apples initially.", 32 | "He buys 2 more apples.", 33 | "So he has 5 + 2 = <<5+2=7>> apples in total." 34 | ] 35 | assert example.final_answer == 7.0 36 | 37 | def test_extract_calculations(self): 38 | """Test extraction of calculations from solution steps.""" 39 | raw_example = { 40 | "question": "Calculation test", 41 | "answer": "Step 1: Calculate 2 + 3 = <<2+3=5>>\nStep 2: Multiply by 4: 5 × 4 = <<5*4=20>>\n#### 20" 42 | } 43 | 44 | example = GSM8KExample.from_raw(raw_example) 45 | calculations = example.extract_calculations() 46 | 47 | assert len(calculations) == 2 48 | 49 | # First calculation 50 | expr1, expected1, actual1 = calculations[0] 51 | assert expr1 == "2+3" 52 | assert expected1 == 5 53 | assert actual1 == 5 54 | 55 | # Second calculation 56 | expr2, expected2, actual2 = calculations[1] 57 | assert expr2 == "5*4" 58 | assert expected2 == 20 59 | assert actual2 == 20 60 | 61 | 62 | @pytest.mark.parametrize("subset_size", [None, 5, 10]) 63 | def test_benchmark_initialization(subset_size): 64 | """Test initializing the GSM8K benchmark with various subset sizes.""" 65 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 66 | # Mock the dataset loading 67 | mock_dataset = MagicMock() 68 | mock_dataset.__getitem__.return_value = [ 69 | {"question": f"Question {i}", "answer": f"Some steps\n#### {i}"} 70 | for i in range(1, 21) # Create 20 mock examples 71 | ] 72 | mock_load_dataset.return_value = mock_dataset 73 | 74 | benchmark = GSM8KBenchmark(seed=42, subset_size=subset_size) 75 | 76 | # Check benchmark properties 77 | assert benchmark.name == "gsm8k" 78 | 79 | # Verify subset_size is respected 80 | if subset_size: 81 | assert len(benchmark.problems) == subset_size 82 | else: 83 | assert len(benchmark.problems) == 20 # All examples 84 | 85 | # Verify problems have the expected structure 86 | for problem in benchmark.problems: 87 | assert isinstance(problem.statement, str) 88 | assert isinstance(problem.problem_id, str) # Just check it's a string 89 | assert isinstance(problem.answer, float) 90 | assert isinstance(problem.answer_discussion, str) 91 | 92 | 93 | @pytest.mark.asyncio 94 | async def test_score_problem_correct(): 95 | """Test scoring a correct GSM8K answer.""" 96 | with 
patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 97 | # Mock the dataset loading 98 | mock_dataset = MagicMock() 99 | mock_dataset.__getitem__.return_value = [ 100 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 101 | ] 102 | mock_load_dataset.return_value = mock_dataset 103 | 104 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 105 | problem = benchmark.problems[0] 106 | 107 | # Create a temporary directory for the answer 108 | with tempfile.TemporaryDirectory() as tmp_dir: 109 | answer_dir = Path(tmp_dir) 110 | 111 | # Create answer.txt with the correct answer 112 | answer_file = answer_dir / "answer.txt" 113 | answer_file.write_text("42") 114 | 115 | # Score the answer 116 | score, errors, discussion = await benchmark.score_problem( 117 | problem=problem, 118 | agent_workdir="/fake/workdir", 119 | agent_answer_dir=str(answer_dir), 120 | container_name="fake_container" 121 | ) 122 | 123 | # Verify the scoring 124 | assert score == 1.0 125 | assert errors is None 126 | assert discussion is not None 127 | 128 | 129 | @pytest.mark.asyncio 130 | async def test_score_problem_incorrect(): 131 | """Test scoring an incorrect GSM8K answer.""" 132 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 133 | # Mock the dataset loading 134 | mock_dataset = MagicMock() 135 | mock_dataset.__getitem__.return_value = [ 136 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 137 | ] 138 | mock_load_dataset.return_value = mock_dataset 139 | 140 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 141 | problem = benchmark.problems[0] 142 | 143 | # Create a temporary directory for the answer 144 | with tempfile.TemporaryDirectory() as tmp_dir: 145 | answer_dir = Path(tmp_dir) 146 | 147 | # Create answer.txt with an incorrect answer 148 | answer_file = answer_dir / "answer.txt" 149 | answer_file.write_text("43") 150 | 151 | # Score the answer 152 | score, errors, discussion = await benchmark.score_problem( 153 | problem=problem, 154 | agent_workdir="/fake/workdir", 155 | agent_answer_dir=str(answer_dir), 156 | container_name="fake_container" 157 | ) 158 | 159 | # Verify the scoring 160 | assert score == 0.0 161 | assert errors is None 162 | assert discussion is not None 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_score_problem_invalid_format(): 167 | """Test scoring a GSM8K answer with invalid format.""" 168 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 169 | # Mock the dataset loading 170 | mock_dataset = MagicMock() 171 | mock_dataset.__getitem__.return_value = [ 172 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 173 | ] 174 | mock_load_dataset.return_value = mock_dataset 175 | 176 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 177 | problem = benchmark.problems[0] 178 | 179 | # Create a temporary directory for the answer 180 | with tempfile.TemporaryDirectory() as tmp_dir: 181 | answer_dir = Path(tmp_dir) 182 | 183 | # Create answer.txt with an incorrectly formatted answer 184 | answer_file = answer_dir / "answer.txt" 185 | answer_file.write_text("The answer is forty-two") 186 | 187 | # Score the answer 188 | score, errors, discussion = await benchmark.score_problem( 189 | problem=problem, 190 | agent_workdir="/fake/workdir", 191 | agent_answer_dir=str(answer_dir), 192 | container_name="fake_container" 193 | ) 194 | 195 | # Verify the scoring 196 | assert score == 0.0 197 | assert errors is not None # Should have parsing errors 198 | assert "could not convert string to float" in 
errors.lower() or "invalid literal" in errors.lower() 199 | -------------------------------------------------------------------------------- /base_agent/tests/test_example.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from src.types.llm_types import Model 7 | 8 | def test_example(): 9 | assert True 10 | assert isinstance(Model.SONNET_35.id, str) 11 | -------------------------------------------------------------------------------- /base_agent/tests/tools/reasoning_structures/test_sequential.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Tests for the Sequential reasoning structure tool.""" 7 | import pytest 8 | from unittest.mock import patch, AsyncMock 9 | 10 | from src.tools.reasoning_structures.sequential import ( 11 | ToolBasedReasoningStructure, Step, InvocationState, _make_id 12 | ) 13 | from src.agents.implementations import DemoAgent 14 | from src.types.tool_types import ToolResult 15 | 16 | 17 | # Do not use global pytestmark 18 | # Apply asyncio marker only to functions that need it 19 | @pytest.mark.asyncio 20 | async def test_initialization(): 21 | """Test proper initialization of the reasoning structure.""" 22 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 23 | 24 | # Verify basic properties 25 | assert structure.TOOL_NAME == "example_reasoning_structure" 26 | assert hasattr(structure, "_steps") 27 | assert len(structure._steps) > 0 28 | assert all(isinstance(step, Step) for step in structure._steps) 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_run_initializes_state(): 33 | """Test that run() correctly initializes state.""" 34 | agent = DemoAgent() 35 | structure = ToolBasedReasoningStructure(calling_agent=agent) 36 | 37 | # Run the reasoning structure 38 | result = await structure.run() 39 | 40 | # Verify state initialization 41 | assert len(agent._local_state) == 1 42 | 43 | invocation_id = next(iter(agent._local_state.keys())) 44 | invocation = agent._local_state[invocation_id] 45 | 46 | assert isinstance(invocation, InvocationState) 47 | assert invocation.steps == structure._steps 48 | assert invocation.current_step_id == structure._steps[0].identifier 49 | assert invocation.current_step_complete_tool is not None 50 | 51 | 52 | @pytest.mark.asyncio 53 | async def test_run_registers_completion_tool(): 54 | """Test that run() registers a completion tool for the first step.""" 55 | # Create an empty mock registry 56 | mock_registry = {} 57 | 58 | # Apply the patch within the test 59 | with patch("src.tools.reasoning_structures.sequential.tool_registry", mock_registry): 60 | agent = DemoAgent() 61 | structure = ToolBasedReasoningStructure(calling_agent=agent) 62 | 63 | # Run the reasoning structure 64 | await structure.run() 65 | 66 | # Verify a tool was registered 67 | assert len(mock_registry) == 1 68 | 69 | # Get the registered tool 70 | tool_name = next(iter(mock_registry.keys())) 71 | 72 | # Verify it's a completion tool 73 | assert tool_name.endswith("_complete") 74 | assert mock_registry[tool_name] in agent._available_tools 75 | 76 | 77 
| @pytest.mark.asyncio 78 | async def test_run_returns_correct_result(): 79 | """Test that run() returns the expected result structure.""" 80 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 81 | 82 | # Run the reasoning structure 83 | result = await structure.run() 84 | 85 | # Verify result properties 86 | assert isinstance(result, ToolResult) 87 | assert result.tool_name == structure.TOOL_NAME 88 | assert result.success is True 89 | assert "step id" in result.output.lower() 90 | assert "step instructions" in result.output.lower() 91 | 92 | 93 | @pytest.mark.asyncio 94 | async def test_step_completion_tool_creation(): 95 | """Test the creation of step completion tools.""" 96 | # Setup a mock for create_step_tool 97 | with patch("src.tools.reasoning_structures.sequential.create_step_tool") as mock_create_step_tool: 98 | # Setup mock return value 99 | mock_tool_cls = AsyncMock() 100 | mock_create_step_tool.return_value = mock_tool_cls 101 | 102 | # Create and run structure 103 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 104 | await structure.run() 105 | 106 | # Verify tool creation 107 | mock_create_step_tool.assert_called_once() 108 | 109 | # Check arguments 110 | args = mock_create_step_tool.call_args[0] 111 | assert isinstance(args[0], str) # invocation_id 112 | assert isinstance(args[1], Step) # step 113 | 114 | # No asyncio marker for this function since it's synchronous 115 | def test_step_creation_utility(): 116 | """Test the utility function for creating step identifiers.""" 117 | # Generate IDs with custom prefix 118 | ids = [_make_id("test_prefix") for _ in range(5)] 119 | 120 | # Verify uniqueness 121 | assert len(ids) == len(set(ids)) 122 | 123 | # Verify format 124 | for id in ids: 125 | assert id.startswith("test_prefix_") 126 | assert len(id) > len("test_prefix_") 127 | 128 | # Verify default prefix works 129 | default_id = _make_id() 130 | assert default_id.startswith("step_") 131 | -------------------------------------------------------------------------------- /base_agent/tests/tools/test_base_tool.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """Tests for the BaseTool class functionality.""" 7 | import pytest 8 | from unittest.mock import Mock, patch 9 | import asyncio 10 | from typing import Optional 11 | 12 | # Fix the import paths to work when running from the base_agent directory 13 | from src.tools.base_tool import BaseTool, tool_registry 14 | from src.types.tool_types import ToolResult 15 | from src.types.agent_types import AgentInterface 16 | from src.types.common import ArgFormat 17 | 18 | class TestBaseTool: 19 | """Test suite for BaseTool class.""" 20 | 21 | def setup_method(self): 22 | """Setup for each test method.""" 23 | # Save the original registry and clear it for testing 24 | self.original_registry = dict(tool_registry) 25 | tool_registry.clear() 26 | 27 | def teardown_method(self): 28 | """Teardown after each test method.""" 29 | # Restore the original registry after each test 30 | tool_registry.clear() 31 | tool_registry.update(self.original_registry) 32 | 33 | def test_tool_registration(self): 34 | """Test that tools are properly registered through metaclass.""" 35 | # Define a test tool class 36 | class TestTool(BaseTool): 37 | TOOL_NAME = "test_tool" 38 | TOOL_DESCRIPTION = "A test tool for registration" 39 | 40 | async def run(self) -> ToolResult: 41 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 42 | 43 | @classmethod 44 | def generate_examples(cls): 45 | return [] 46 | 47 | # Verify the tool was registered correctly 48 | assert "test_tool" in tool_registry 49 | assert tool_registry["test_tool"] == TestTool 50 | 51 | @pytest.mark.asyncio 52 | async def test_tool_examples(self): 53 | """Test that generate_examples returns valid examples.""" 54 | # Define a test tool with examples 55 | class ExampleTool(BaseTool): 56 | TOOL_NAME = "example_tool" 57 | TOOL_DESCRIPTION = "Test tool with examples" 58 | 59 | async def run(self) -> ToolResult: 60 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 61 | 62 | @classmethod 63 | def generate_examples(cls): 64 | # Return a minimal valid example 65 | mock_agent = Mock(spec=AgentInterface) 66 | tool_instance = cls(calling_agent=mock_agent) 67 | tool_result = ToolResult(tool_name=cls.TOOL_NAME, success=True) 68 | return [(tool_instance, tool_result)] 69 | 70 | # Check examples format 71 | examples = ExampleTool.generate_examples() 72 | 73 | assert isinstance(examples, list) 74 | assert len(examples) == 1 75 | example = examples[0] 76 | assert isinstance(example, tuple) 77 | assert len(example) == 2 78 | assert isinstance(example[0], ExampleTool) 79 | assert isinstance(example[1], ToolResult) 80 | 81 | @pytest.mark.asyncio 82 | async def test_args_str_to_dict(self): 83 | """Test XML and JSON argument parsing.""" 84 | from pydantic import Field 85 | 86 | class ArgTool(BaseTool): 87 | TOOL_NAME = "arg_tool" 88 | TOOL_DESCRIPTION = "Test tool with arguments" 89 | 90 | arg1: str = Field(..., description="Test argument") 91 | arg2: int = Field(default=0, description="Optional argument") 92 | 93 | async def run(self) -> ToolResult: 94 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 95 | 96 | @classmethod 97 | def generate_examples(cls): 98 | return [] 99 | 100 | # Test XML parsing 101 | xml_args = """ 102 | 103 | test 104 | 42 105 | 106 | """ 107 | args_dict, warnings = await ArgTool.args_str_to_dict(xml_args, ArgFormat.XML) 108 | assert args_dict is not None 109 | assert args_dict["arg1"] == "test" 110 | assert args_dict["arg2"] == 42 111 | assert warnings is None 112 | 113 | # Test bad XML - this should result in a warning and possibly a 
None args_dict 114 | # or a dict with only default values, depending on the implementation 115 | bad_xml = "test" 116 | args_dict, warnings = await ArgTool.args_str_to_dict(bad_xml, ArgFormat.XML) 117 | # The important thing is that a warning is generated 118 | assert warnings is not None 119 | 120 | # We don't make assumptions about whether args_dict is None or partially populated 121 | # as implementation details can vary. If it's None, the test passes. 122 | # If not None, check that it doesn't contain the required field or that it does have defaults. 123 | if args_dict is not None: 124 | # It might contain default values but not the required field 125 | assert "arg1" not in args_dict, "Required field should not be present in malformed XML" 126 | # Optionally check if default values are preserved 127 | # We don't assert this as it's an implementation detail that could change 128 | # assert args_dict.get("arg2") == 0, "Default value should be present" 129 | 130 | @pytest.mark.asyncio 131 | async def test_tool_result_formatting(self): 132 | """Test that tool results are properly formatted.""" 133 | # Create a mock agent for testing 134 | mock_agent = Mock(spec=AgentInterface) 135 | 136 | # Define a simple test tool 137 | class ResultTool(BaseTool): 138 | TOOL_NAME = "result_tool" 139 | TOOL_DESCRIPTION = "Test tool for result formatting" 140 | 141 | async def run(self) -> ToolResult: 142 | return ToolResult( 143 | tool_name=self.TOOL_NAME, 144 | success=True, 145 | output="test output", 146 | warnings="test warning", 147 | errors=None 148 | ) 149 | 150 | @classmethod 151 | def generate_examples(cls): 152 | return [] 153 | 154 | # Test successful tool execution 155 | tool = ResultTool(calling_agent=mock_agent) 156 | result = await tool.run() 157 | 158 | # Check result structure 159 | assert isinstance(result, ToolResult) 160 | assert result.tool_name == "result_tool" 161 | assert result.success is True 162 | assert "test output" in str(result) 163 | assert "test warning" in str(result) 164 | 165 | # Test failure result formatting 166 | failure_result = ToolResult( 167 | tool_name="fail_tool", 168 | success=False, 169 | output=None, 170 | warnings=None, 171 | errors="test error" 172 | ) 173 | 174 | # Check failure result structure 175 | assert isinstance(failure_result, ToolResult) 176 | assert failure_result.tool_name == "fail_tool" 177 | assert failure_result.success is False 178 | assert "test error" in str(failure_result) 179 | assert "SUCCESS" not in str(failure_result) 180 | assert "FAILURE" in str(failure_result) 181 | 182 | if __name__ == "__main__": 183 | # Run the tests directly for debugging 184 | pytest.main(["-xvs", __file__]) 185 | -------------------------------------------------------------------------------- /base_agent/tests/utils/test_parsing.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Tests for the parsing utilities module. 
8 | """ 9 | import re 10 | import pytest 11 | from src.utils.parsing import ( 12 | extract_before_last, 13 | extract_after_last, 14 | extract_after_first, 15 | extract_between_patterns, 16 | parse_number_from_string, 17 | ) 18 | 19 | 20 | # Test extract_before_last 21 | @pytest.mark.parametrize( 22 | "text, pattern, keep_pattern, expected", 23 | [ 24 | ("hello world hello", "hello", False, "hello world "), # Basic case 25 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern 26 | ("no pattern here", "xyz", False, ""), # Pattern not found 27 | ("", "hello", False, ""), # Empty string 28 | ("hello", "hello", False, ""), # Pattern at end 29 | ], 30 | ids=["basic", "keep_pattern", "not_found", "empty", "end_pattern"], 31 | ) 32 | def test_extract_before_last(text, pattern, keep_pattern, expected): 33 | result = extract_before_last(text, pattern, keep_pattern) 34 | assert result == expected, f"Expected '{expected}', got '{result}'" 35 | 36 | 37 | # Test extract_after_last 38 | @pytest.mark.parametrize( 39 | "text, pattern, keep_pattern, expected", 40 | [ 41 | ("hello world hello", "hello", False, ""), # Last occurrence at end 42 | ("hello world hello", "hello", True, "hello"), # Keep pattern 43 | ("hello world hello", "world", False, " hello"), # Middle occurrence 44 | ("no pattern here", "xyz", False, ""), # Pattern not found 45 | ("hello", "hello", True, "hello"), # Single pattern 46 | ], 47 | ids=["end", "keep_pattern", "middle", "not_found", "single"], 48 | ) 49 | def test_extract_after_last(text, pattern, keep_pattern, expected): 50 | result = extract_after_last(text, pattern, keep_pattern) 51 | assert result == expected 52 | 53 | 54 | # Test extract_after_first 55 | @pytest.mark.parametrize( 56 | "text, pattern, keep_pattern, expected", 57 | [ 58 | ("hello world hello", "hello", False, " world hello"), # First occurrence 59 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern 60 | ("no pattern here", "xyz", False, ""), # Pattern not found 61 | ("hello", "he", False, "llo"), # Partial pattern 62 | ("", "xyz", False, ""), # Empty string 63 | ], 64 | ids=["basic", "keep_pattern", "not_found", "partial", "empty"], 65 | ) 66 | def test_extract_after_first(text, pattern, keep_pattern, expected): 67 | result = extract_after_first(text, pattern, keep_pattern) 68 | assert result == expected 69 | 70 | 71 | # Test extract_between_patterns 72 | @pytest.mark.parametrize( 73 | "text, pattern_a, pattern_b, a_occ, b_occ, expected", 74 | [ 75 | # First/Last combinations 76 | ("start middle end", "start", "end", "first", "last", " middle "), 77 | ("a b a c a d", "a", "a", "first", "last", " b a c "), 78 | ("a b a c a d", "a", "a", "last", "first", None), # Invalid range 79 | # Pattern not found 80 | ("hello world", "xyz", "abc", "first", "last", None), 81 | ("hello world", "hello", "xyz", "first", "last", None), 82 | # Edge cases 83 | ("", "a", "b", "first", "last", None), # Empty string 84 | ("abc", "a", "c", "first", "last", "b"), # Adjacent patterns 85 | ], 86 | ids=[ 87 | "first_last", 88 | "multiple_a_last", 89 | "invalid_range", 90 | "a_missing", 91 | "b_missing", 92 | "empty", 93 | "adjacent", 94 | ], 95 | ) 96 | def test_extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ, expected): 97 | result = extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ) 98 | assert result == expected 99 | 100 | 101 | # Test extract_between_patterns with invalid occurrence values 102 | @pytest.mark.parametrize( 103 | "a_occ, b_occ", 104 | 
[("invalid", "first"), ("first", "invalid")], 105 | ids=["invalid_a", "invalid_b"], 106 | ) 107 | def test_extract_between_patterns_invalid_occurrence(a_occ, b_occ): 108 | with pytest.raises(ValueError, match="Invalid value for.*occurrence"): 109 | extract_between_patterns("text", "a", "b", a_occ, b_occ) 110 | 111 | 112 | # Fixture for parse_number_from_string tests 113 | @pytest.fixture 114 | def number_parser(): 115 | return parse_number_from_string 116 | 117 | 118 | # Test parse_number_from_string 119 | @pytest.mark.parametrize( 120 | "input_str, expected", 121 | [ 122 | # Successful cases 123 | ("42", (True, 42.0, None)), 124 | ("-3.14", (True, -3.14, None)), 125 | ("1,234.56", (True, 1234.56, None)), # Commas removed 126 | (" 6.022e23 ", (True, 6.022e23, None)), # Scientific notation 127 | # Success with warning 128 | ("42 extra text", (True, 42.0, "Warning: Found additional text.*")), 129 | # Failure cases 130 | ("no number here", (False, None, "Could not find a number.*")), 131 | ("", (False, None, "Could not find a number.*")), 132 | ], 133 | ids=[ 134 | "integer", 135 | "negative_float", 136 | "comma_float", 137 | "scientific", 138 | "extra_text", 139 | "no_number", 140 | "empty", 141 | ], 142 | ) 143 | def test_parse_number_from_string(number_parser, input_str, expected): 144 | success, value, message = number_parser(input_str) 145 | assert success == expected[0] 146 | assert value == expected[1] 147 | if message is not None and expected[2] is not None: 148 | assert re.match(expected[2], message) # Match regex pattern for message 149 | else: 150 | assert message == expected[2] 151 | 152 | 153 | # Example of a slow test (for demonstration) 154 | @pytest.mark.slow 155 | def test_parse_number_from_string_slow(number_parser): 156 | import time 157 | time.sleep(1) # Simulate slow operation 158 | success, value, _ = number_parser("12345") 159 | assert success and value == 12345.0 160 | -------------------------------------------------------------------------------- /benchmark_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/benchmark_data/.gitkeep -------------------------------------------------------------------------------- /figures/agent_execution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_execution.png -------------------------------------------------------------------------------- /figures/agent_loop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_loop.png -------------------------------------------------------------------------------- /results/interactive_output/agent_outputs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/results/interactive_output/agent_outputs/.gitkeep -------------------------------------------------------------------------------- /sandbox/Dockerfile: -------------------------------------------------------------------------------- 1 | # Based on Fedora 2 | FROM fedora:42 3 | 4 | # Accept TARGET_ARCH 
build argument 5 | ARG TARGET_ARCH=x86_64 6 | 7 | # Set up the environment variables 8 | ENV SANDBOX_DIR=/home/agent \ 9 | SHELL=/bin/bash \ 10 | TZ=Etc/UTC \ 11 | DEBIAN_FRONTEND=noninteractive \ 12 | PATH=/opt/miniconda3/bin:$PATH 13 | 14 | # Setup agent user with sudo access 15 | RUN useradd -m -d /home/agent -s ${SHELL} agent && \ 16 | echo "agent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/agent && \ 17 | echo "Defaults env_keep += \"PATH\"" >> /etc/sudoers.d/agent && \ 18 | chmod 0440 /etc/sudoers.d/agent 19 | 20 | # Install common dev tools 21 | RUN dnf -y install dnf-plugins-core && \ 22 | dnf -y remove selinux-policy* && \ 23 | dnf -y update && \ 24 | dnf -y install \ 25 | gcc gcc-c++ make git git-lfs llvm llvm-devel clang clang-devel \ 26 | nodejs python3.12 python3.12-devel cmake openssh-server \ 27 | tmux lsof strace gdb ltrace valgrind inotify-tools jq pv bzip2 unzip \ 28 | p7zip wget curl sudo file tree which gettext-envsubst patch openssl \ 29 | rsync zip nmap-ncat ripgrep perf poppler-utils lapack-devel blas-devel \ 30 | openssl-devel libffi-devel procps-ng sysstat htop \ 31 | libtiff-devel golang awk \ 32 | # System and networking utilities 33 | hostname net-tools iproute iputils bind-utils tcpdump traceroute mtr \ 34 | psmisc lsof netcat telnet whois tar gzip less findutils 35 | 36 | # Install Miniconda in /opt and set permissions 37 | USER root 38 | RUN mkdir -p /opt/miniconda3 && \ 39 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${TARGET_ARCH}.sh -O /opt/miniconda3/miniconda.sh && \ 40 | bash /opt/miniconda3/miniconda.sh -b -u -p /opt/miniconda3 && \ 41 | rm /opt/miniconda3/miniconda.sh && \ 42 | chown -R agent:agent /opt/miniconda3 && \ 43 | chmod -R u+w /opt/miniconda3 44 | 45 | 46 | # Configure conda 47 | RUN /opt/miniconda3/bin/conda init --all && \ 48 | /opt/miniconda3/bin/conda config --append channels conda-forge 49 | 50 | # Create system-wide conda initialization 51 | RUN echo '. /opt/miniconda3/etc/profile.d/conda.sh' >> /etc/bashrc && \ 52 | echo 'source /opt/miniconda3/bin/activate' >> /etc/bashrc && \ 53 | mkdir -p /etc/profile.d && \ 54 | echo '. 
/opt/miniconda3/etc/profile.d/conda.sh' >> /etc/profile.d/conda.sh && \
55 |     echo 'source /opt/miniconda3/bin/activate' >> /etc/profile.d/conda.sh && \
56 |     chmod +x /etc/profile.d/conda.sh
57 | 
58 | # Switch back to root for system configurations
59 | USER root
60 | 
61 | # Setup Python 3.12 as default python
62 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \
63 |     alternatives --set python3 /usr/bin/python3.12 && \
64 |     alternatives --install /usr/bin/python python /usr/bin/python3 1
65 | 
66 | # Create necessary directories for pnpm
67 | RUN mkdir -p ${SANDBOX_DIR}/.local/share/pnpm && \
68 |     touch ${SANDBOX_DIR}/.bashrc && \
69 |     chown -R agent:agent ${SANDBOX_DIR}/.local && \
70 |     chown agent:agent ${SANDBOX_DIR}/.bashrc && \
71 |     chown agent:agent ${SANDBOX_DIR}
72 | 
73 | COPY configs/gitignore ${SANDBOX_DIR}/.gitignore
74 | RUN chown agent:agent ${SANDBOX_DIR}/.gitignore && \
75 |     chmod +w ${SANDBOX_DIR}/.gitignore
76 | 
77 | # Switch back to agent user for remaining setup
78 | USER agent
79 | WORKDIR ${SANDBOX_DIR}
80 | 
81 | # Set directory permissions
82 | RUN mkdir -p ${SANDBOX_DIR}/.ssh && \
83 |     chmod 700 ${SANDBOX_DIR}/.ssh && \
84 |     touch ${SANDBOX_DIR}/.ssh/authorized_keys && \
85 |     chmod 600 ${SANDBOX_DIR}/.ssh/authorized_keys
86 | 
87 | RUN curl https://raw.githubusercontent.com/github/gitignore/main/Python.gitignore >> ${SANDBOX_DIR}/.gitignore && \
88 |     curl https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore >> ${SANDBOX_DIR}/.gitignore
89 | 
90 | # Install and setup pnpm
91 | ENV PNPM_HOME=${SANDBOX_DIR}/.local/share/pnpm
92 | ENV PATH=$PNPM_HOME:$PATH
93 | ENV NODE_OPTIONS=--max_old_space_size=4096
94 | RUN curl -fsSL https://get.pnpm.io/install.sh | ENV="${SANDBOX_DIR}/.bashrc" SHELL="/bin/bash" bash - && \
95 |     . ${SANDBOX_DIR}/.bashrc && \
96 |     echo "export PNPM_HOME=$PNPM_HOME" >> ${SANDBOX_DIR}/.bashrc && \
97 |     echo "export PATH=$PNPM_HOME:\$PATH" >> ${SANDBOX_DIR}/.bashrc && \
98 |     . ${SANDBOX_DIR}/.bashrc && \
99 |     pnpm install -g typescript ts-node @types/node prettier eslint tsx
100 | 
101 | # Install and configure Rust using rustup
102 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
103 |     . ${SANDBOX_DIR}/.cargo/env && \
104 |     rustup component add rust-src && \
105 |     echo '. ${SANDBOX_DIR}/.cargo/env' >> ${SANDBOX_DIR}/.bashrc
106 | 
107 | # Install LSP Servers for common languages with architecture awareness
108 | RUN . ${SANDBOX_DIR}/.bashrc && \
109 |     # Python - Pyright
110 |     sudo dnf install -y npm && \
111 |     sudo npm install -g pyright && \
112 |     # JavaScript/TypeScript
113 |     pnpm install -g typescript-language-server typescript && \
114 |     # Rust - Install and configure Rust using rustup
115 |     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
116 |     . ${SANDBOX_DIR}/.cargo/env && \
117 |     rustup component add rust-src rust-analyzer && \
118 |     echo '. ${SANDBOX_DIR}/.cargo/env' >> ${SANDBOX_DIR}/.bashrc && \
119 |     # Go - Install gopls
120 |     go install golang.org/x/tools/gopls@latest
121 | 
122 | # Configure environment
123 | ENV HOME=${SANDBOX_DIR}
124 | 
125 | # Copy and install some base requirements
126 | COPY base_requirements.txt /tmp/base_requirements.txt
127 | RUN pip install -r /tmp/base_requirements.txt && \
128 |     sudo rm /tmp/base_requirements.txt
129 | 
130 | # Copy and install agent dependencies (maintaining current approach)
131 | COPY --from=base_agent --chown=agent:agent .
/tmp/base_agent 132 | RUN cd /tmp/base_agent && pip install -r requirements.txt 133 | 134 | WORKDIR ${SANDBOX_DIR} 135 | 136 | # Expose necessary ports (maintaining current approach) 137 | EXPOSE 5000 80 22 443 8080 8000 138 | 139 | ARG ANTHROPIC_API_KEY 140 | ARG OPENAI_API_KEY 141 | ARG FIREWORKS_AI_API_KEY 142 | ARG GEMINI_API_KEY 143 | ARG DEEPSEEK_API_KEY 144 | ARG VERTEX_PROJECT_ID 145 | 146 | ENV ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 147 | ENV OPENAI_API_KEY=${OPENAI_API_KEY} 148 | ENV FIREWORKS_AI_API_KEY=${FIREWORKS_AI_API_KEY} 149 | ENV GEMINI_API_KEY=${GEMINI_API_KEY} 150 | ENV DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY} 151 | ENV VERTEX_PROJECT_ID=${VERTEX_PROJECT_ID} 152 | 153 | COPY GOOGLE_APPLICATION_CREDENTIALS.json /tmp/GOOGLE_APPLICATION_CREDENTIALS.json 154 | ENV GOOGLE_APPLICATION_CREDENTIALS=/tmp/GOOGLE_APPLICATION_CREDENTIALS.json 155 | 156 | # Set the entrypoint (maintaining current approach) 157 | CMD ["/bin/bash", "--login"] 158 | -------------------------------------------------------------------------------- /sandbox/GOOGLE_APPLICATION_CREDENTIALS.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "", 4 | "private_key_id": "", 5 | "private_key": "", 6 | "client_email": "", 7 | "client_id": "", 8 | "auth_uri": "", 9 | "token_uri": "", 10 | "auth_provider_x509_cert_url": "", 11 | "client_x509_cert_url": "", 12 | "universe_domain": "" 13 | } 14 | -------------------------------------------------------------------------------- /sandbox/base_requirements.txt: -------------------------------------------------------------------------------- 1 | # System-wide Python packages for development 2 | # Intentionally _not_ pinning versions so we get recent versions on every build 3 | black 4 | flake8 5 | -------------------------------------------------------------------------------- /sandbox/configs/gitignore: -------------------------------------------------------------------------------- 1 | # IDEs 2 | .idea/ 3 | .vscode/ 4 | *.swp 5 | *.swo 6 | 7 | # Build outputs 8 | target/ 9 | dist/ 10 | build/ 11 | *.o 12 | *.a 13 | *.so 14 | 15 | # Logs & temp 16 | *.log 17 | tmp/ 18 | temp/ 19 | 20 | # Directories to ignore at any depth 21 | **/.maestro 22 | **/.vscode 23 | **/.vscode-server 24 | **/.ssh 25 | -------------------------------------------------------------------------------- /sandbox/configs/sandbox_bashrc: -------------------------------------------------------------------------------- 1 | # sandbox_bashrc 2 | 3 | # Guard against sourcing multiple times 4 | if [ -n "$SANDBOX_BASHRC_SOURCED" ]; then 5 | return 6 | fi 7 | export SANDBOX_BASHRC_SOURCED=1 8 | 9 | # If not running interactively, don't do anything 10 | [[ $- != *i* ]] && return 11 | 12 | # User-specific environment 13 | if ! [[ "$PATH" =~ "$HOME/.local/bin:$HOME/bin:" ]] 14 | then 15 | PATH="$HOME/.local/bin:$HOME/bin:$PATH" 16 | fi 17 | export PATH 18 | 19 | # User specific aliases and functions 20 | if [ -d ~/.bashrc.d ]; then 21 | for rc in ~/.bashrc.d/*; do 22 | if [ -f "$rc" ]; then 23 | . "$rc" 24 | fi 25 | done 26 | fi 27 | 28 | unset rc 29 | -------------------------------------------------------------------------------- /scripts/install_swebench_harness.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://github.com/swe-bench/SWE-bench 3 | cd SWE-bench 4 | pip install -e . 
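5 | 
6 | # Optional sanity check (an assumption, not part of the original setup): confirm
7 | # that the editable install is importable from the current Python environment.
8 | python3 -c "import swebench"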
9 | --------------------------------------------------------------------------------