├── .gitignore
├── CITATION.cff
├── LICENSE
├── Makefile
├── README.md
├── base_agent
│   ├── .gitignore
│   ├── README.md
│   ├── __main__.py
│   ├── agent.py
│   ├── agent_change_log.md
│   ├── conftest.py
│   ├── description.txt
│   ├── pytest.ini
│   ├── requirements.txt
│   ├── src
│   │   ├── __init__.py
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   ├── agent_calling.py
│   │   │   ├── assistant_base_agent.py
│   │   │   ├── base_agent.py
│   │   │   └── implementations
│   │   │       ├── __init__.py
│   │   │       ├── archive_explorer.py
│   │   │       ├── coder.py
│   │   │       ├── main_orchestrator.py
│   │   │       ├── problem_solver.py
│   │   │       ├── reasoner.py
│   │   │       └── review_committee_member.py
│   │   ├── benchmarks
│   │   │   ├── __init__.py
│   │   │   ├── aime.py
│   │   │   ├── aiq_benchmark.py
│   │   │   ├── aiq_project_benchmarks.py
│   │   │   ├── arc_agi.py
│   │   │   ├── base.py
│   │   │   ├── drop.py
│   │   │   ├── file_editing.py
│   │   │   ├── gpqa.py
│   │   │   ├── gsm8k.py
│   │   │   ├── gsm_ic.py
│   │   │   ├── humaneval.py
│   │   │   ├── livecodebench.py
│   │   │   ├── math.py
│   │   │   ├── refute.py
│   │   │   ├── swebench_verified.py
│   │   │   └── symbol_location.py
│   │   ├── callgraph
│   │   │   ├── __init__.py
│   │   │   ├── digraph.py
│   │   │   ├── manager.py
│   │   │   └── reporting.py
│   │   ├── config.py
│   │   ├── events
│   │   │   ├── __init__.py
│   │   │   ├── event_bus.py
│   │   │   └── event_bus_utils.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── base.py
│   │   │   ├── metering.py
│   │   │   └── providers
│   │   │       ├── __init__.py
│   │   │       ├── anthropic.py
│   │   │       ├── base_provider.py
│   │   │       ├── deepseek.py
│   │   │       ├── fireworks.py
│   │   │       ├── google.py
│   │   │       ├── google_oai.py
│   │   │       ├── google_rest.py
│   │   │       ├── openai.py
│   │   │       └── vertex.py
│   │   ├── oversight
│   │   │   ├── graph_visualisation.py
│   │   │   └── overseer.py
│   │   ├── schemas
│   │   │   ├── __init__.py
│   │   │   ├── json_parsing.py
│   │   │   ├── representation.py
│   │   │   ├── xml_dumps.py
│   │   │   └── xml_parsing.py
│   │   ├── tools
│   │   │   ├── __init__.py
│   │   │   ├── answer_submission.py
│   │   │   ├── archive_tools.py
│   │   │   ├── base_agent_tools.py
│   │   │   ├── base_tool.py
│   │   │   ├── calculator.py
│   │   │   ├── committee_design.py
│   │   │   ├── directory_tools.py
│   │   │   ├── edit_tools
│   │   │   │   ├── __init__.py
│   │   │   │   ├── overwrite_file.py
│   │   │   │   └── utils.py
│   │   │   ├── execute_command.py
│   │   │   ├── file_tools.py
│   │   │   ├── reasoning_structures
│   │   │   │   ├── __init__.py
│   │   │   │   ├── coding.py
│   │   │   │   ├── meta_improvement.py
│   │   │   │   ├── sequential.py
│   │   │   │   └── sequential_subagents.py
│   │   │   └── ripgrep_tool.py
│   │   ├── types
│   │   │   ├── __init__.py
│   │   │   ├── agent_types.py
│   │   │   ├── common.py
│   │   │   ├── event_types.py
│   │   │   ├── llm_types.py
│   │   │   └── tool_types.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   ├── archive_analysis.py
│   │   │   ├── documentation.py
│   │   │   ├── file_views.py
│   │   │   ├── metrics.py
│   │   │   ├── parsing.py
│   │   │   └── stop_tokens.py
│   │   └── web_server
│   │       ├── __init__.py
│   │       ├── server.py
│   │       ├── static
│   │       │   ├── components
│   │       │   │   ├── event-stream.js
│   │       │   │   ├── execution-tree.js
│   │       │   │   └── metrics-display.js
│   │       │   ├── core.js
│   │       │   ├── store.js
│   │       │   ├── styles.css
│   │       │   ├── utils
│   │       │   │   ├── event-utils.js
│   │       │   │   └── formatters.js
│   │       │   └── visualizer.js
│   │       └── templates
│   │           └── index.html
│   └── tests
│       ├── agents
│       │   └── test_agent_calling.py
│       ├── benchmarks
│       │   ├── disabled_test_livecode_benchmark.py
│       │   ├── test_benchmark_base.py
│       │   ├── test_file_editing.py
│       │   ├── test_gsm8k_benchmark.py
│       │   └── test_refute_benchmark.py
│       ├── events
│       │   └── test_event_bus.py
│       ├── test_example.py
│       ├── tools
│       │   ├── reasoning_structures
│       │   │   └── test_sequential.py
│       │   ├── test_base_tool.py
│       │   ├── test_calculator.py
│       │   └── test_execute_command.py
│       └── utils
│           ├── test_archive_analysis.py
│           └── test_parsing.py
├── benchmark_data
│   └── .gitkeep
├── figures
│   ├── agent_execution.png
│   └── agent_loop.png
├── results
│   └── interactive_output
│       └── agent_outputs
│           └── .gitkeep
├── runner.py
├── sandbox
│   ├── Dockerfile
│   ├── GOOGLE_APPLICATION_CREDENTIALS.json
│   ├── base_requirements.txt
│   └── configs
│       ├── gitignore
│       └── sandbox_bashrc
└── scripts
    └── install_swebench_harness.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | **/__pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | ### VisualStudioCode
132 | .vscode/*
133 | !.vscode/settings.json
134 | !.vscode/tasks.json
135 | !.vscode/launch.json
136 | !.vscode/extensions.json
137 | *.code-workspace
138 | **/.vscode
139 |
140 | # JetBrains
141 | .idea/
142 |
143 | # Data & Models
144 | *.h5
145 | *.tar
146 | *.tar.gz
147 |
148 | # Lightning-Hydra-Template
149 | configs/local/default.yaml
150 | /data/
151 | /logs/
152 | .env
153 |
154 | # Aim logging
155 | .aim
156 |
157 | # Custom files and directories
158 | third_party
159 | benchmark_data/aiq_bench
160 | benchmark_data/file_editing_bench
161 | benchmark_data/symbol_location_bench
162 | check_boilerplate.sh
163 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.1.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: Robeyns
5 | given-names: Maxime
6 | orcid: https://orcid.org/0000-0001-9802-9597
7 | - family-names: Szummer
8 | given-names: Martin
9 | - family-names: Aitchison
10 | given-names: Laurence
11 | title: "Self-Improving Coding Agent"
12 | version: 0.0.1
13 | date-released: 2025-04-12
14 | repository-code: "https://github.com/MaximeRobeyns/self_improving_coding_agent"
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2025 Maxime Robeyns
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: test
2 |
3 | PWD := $(shell pwd)
4 |
5 | int: ## Interactive run; uses default shell entrypoint
6 | @echo 'Once in the container, type:'
7 | @echo 'python -m agent_code.agent -s -p ""'
8 | @echo 'Watch the agent work on localhost:8080'
9 | docker run --rm -ti \
10 | -p 8080:8080 \
11 | -v ${PWD}/base_agent:/home/agent/agent_code:ro \
12 | -v ${PWD}/results/interactive_output:/home/agent/workdir:rw \
13 | sica_sandbox
14 |
15 | test: ## Run the unit tests for the agent
16 | @pytest base_agent
17 |
18 | image: ## Docker image for x86_64
19 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \
20 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \
21 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \
22 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \
23 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \
24 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \
25 | docker buildx build --build-context base_agent=./base_agent \
26 | -f sandbox/Dockerfile \
27 | -t sica_sandbox \
28 | --build-arg TARGET_ARCH=x86_64 \
29 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \
30 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \
31 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \
32 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \
33 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \
34 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \
35 | --load sandbox
36 |
37 | image-mac: ## Docker image for apple silicon
38 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \
39 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \
40 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \
41 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \
42 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \
43 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \
44 | docker buildx build --build-context base_agent=./base_agent \
45 | -f sandbox/Dockerfile \
46 | -t sica_sandbox \
47 | --build-arg TARGET_ARCH=aarch64 \
48 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \
49 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \
50 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \
51 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \
52 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \
53 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \
54 | --load sandbox
55 |
56 | docs: ## Compile documentation
57 | python base_agent/src/utils/documentation.py base_agent > base_agent/DOCUMENTATION.md
58 |
59 | meta: ## Run the meta-agent directly for testing (see manual request in __main__.py)
60 | rm -rf results/meta
61 | mkdir -p results/meta/test_logs
62 | cp -r base_agent results/meta/agent_iter
63 | # Copy an existing archive so that the meta agent has something to work with
64 | cp -r results/run_1 results/meta/archive
65 | @echo localhost:8080
66 | docker run --rm -ti \
67 | -p 8080:8080 \
68 | -v ${PWD}/base_agent:/home/agent/meta:ro \
69 | -v ${PWD}/results/meta/archive:/home/agent/archive:ro \
70 | -v ${PWD}/results/meta/agent_iter:/home/agent/workdir:rw \
71 | -v ${PWD}/results/meta/test_logs:/home/agent/meta_logdir:rw \
72 | sica_sandbox python -m meta improve \
73 | --workdir /home/agent/workdir \
74 | --logdir /home/agent/meta_logdir
75 |
76 | test_meta_int: ## Interactively test the resulting agent from the target above
77 | docker run --rm -ti \
78 | -p 8080:8080 \
79 | -p 8000:8000 \
80 | -v ${PWD}/results/meta/agent_iter:/home/agent/agent_code:ro \
81 | -v ${PWD}/results/meta/test_output:/home/agent/workdir:rw \
82 | sica_sandbox
83 |
84 |
85 | help:
86 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
87 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 |
3 | A coding agent experiment that works on its own codebase.
4 |
9 | The system operates as an iterative improvement loop:
10 | 1. evaluating the current agent version on a set of benchmark tasks to measure its performance
11 | 2. storing the results in an archive
12 | 3. running the agent on its own codebase to work on an improvement
13 | 4. going back to step 1 with the updated agent code
14 |
15 | See [our workshop paper](https://openreview.net/pdf?id=rShJCyLsOr) for more details.
16 |
17 | ## Quickstart
18 |
19 | > IMPORTANT NOTE: always run the agent in the provided Docker container. Since the agent can execute shell commands, this offers some isolation from your host machine, avoiding inadvertent file system manipulation and similar risks.
20 |
21 | First, make sure you've cloned the repo
22 | ```bash
23 | git clone https://github.com/MaximeRobeyns/self_improving_coding_agent
24 | ```
25 |
26 | Then, export some environment variables which will be made available in the
27 | docker container. The project supports inference from a number of providers to
28 | allow for experimentation across many models. You must export at least one of
29 | these in your _local_ shell, which you can do either directly or with `direnv`,
30 | `dotenv`, etc. Omitting any provider key will simply make that provider's
31 | models unavailable to the agent.
32 |
33 | ```bash
34 | export ANTHROPIC_API_KEY= # For Claude models
35 | export OPENAI_API_KEY= # For GPT 4o and reasoning models (o1, o3, etc)
36 | export GEMINI_API_KEY= # For Gemini models
37 | export VERTEX_PROJECT_ID= # For models hosted on GCP's Vertex
38 | export FIREWORKS_AI_API_KEY= # For DeepSeek / Llama hosted on fireworks
39 | export DEEPSEEK_API_KEY= # For DeepSeek direct inference (V3, R1)
40 | export MODAL_TOKEN_ID= # To allow the agent to visit webpages and read papers
41 | export MODAL_TOKEN_SECRET= # To allow the agent to visit webpages and read papers
42 | ```
43 | For Gemini, you should replace the template file at `sandbox/GOOGLE_APPLICATION_CREDENTIALS.json` with your own credentials.
44 |
45 | Once you have at least one LLM provider's API key exported, you can build the docker image. The build command is wrapped in a Makefile target for convenience:
46 |
47 | ```bash
48 | make image
49 | ```
50 |
51 | If you are using an Apple silicon machine, use this target instead:
52 | ```
53 | make image-mac
54 | ```
55 |
56 | Finally, install the requirements in your local python environment:
57 | ```bash
58 | # remember to activate a virtual environment or equivalent here
59 | pip install -r base_agent/requirements.txt
60 | pip install swebench
61 | ```
62 |
63 | ### Testing the Agent
64 |
65 | To test if the setup was successful, you can run the agent interactively with a manually set initial prompt using this target
66 | ```bash
67 | make int
68 | ```
69 | This will start the docker container and attach your shell to it. You can then run
70 | ```bash
71 | python -m agent_code.agent --server true -p ""
72 | ```
73 | Then open your browser on http://localhost:8080 to follow the agent execution. This will show you an interactive webpage which visualises the events in the event bus / the agent callgraph, allowing you to click on individual events to see them in more detail, read overseer messages, and collapse sub-agent traces.
74 |
75 | 
76 |
77 | The agent's working directory is mapped to `results/interactive_output`, and any files created will be available there on your machine. Agent logs will be written to `results/interactive_output/agent_outputs`.
78 |
79 | You can see more options by doing
80 | ```bash
81 | make help
82 | ```
83 | or see the agent arguments with
84 | ```bash
85 | python -m base_agent.agent --help
86 | ```
87 |
88 | To further configure the agent, including the choice of LLMs, edit `base_agent/src/config.py`.
89 |
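For orientation, here is a minimal sketch of the kind of setting you might change there. The class and field names below are assumptions inferred from how the rest of the source reads the config (agents declare `MODEL = settings.MODEL`, and a `Model` enum with members such as `Model.SONNET_35` lives in `src/types/llm_types.py`); the real file may differ.

```python
# Sketch only: the actual contents of base_agent/src/config.py may differ.
from pydantic_settings import BaseSettings  # pydantic-settings is in requirements.txt

from .types.llm_types import Model  # referenced elsewhere, e.g. Model.SONNET_35


class Settings(BaseSettings):
    # Default LLM for agents that declare MODEL = settings.MODEL
    MODEL: Model = Model.SONNET_35


settings = Settings()
```
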
90 | ## Self-Improvement Loop
91 |
92 | To run the self-improvement loop, first inspect the list of benchmarks in the `base_agent/src/benchmarks/__init__.py` file, and make sure that you have uncommented those you want to include. Then do
93 | ```bash
94 | python runner.py
95 | ```
96 | To see all the options, do
97 | ```bash
98 | python runner.py --help
99 | ```
100 | Common options might be
101 | ```bash
102 | python runner.py --id 1 --workers 6
103 | ```
104 |
105 | This will start the agent loop, placing the results in `results/run_{id}`.
106 |
107 | ## Things to work on
108 |
109 | Here are some potential things to try and do with the agent framework:
110 |
111 | - [ ] get the agent to curate / build more of its own benchmarks
112 | - [ ] reduce the variance of self-improvement runs (early features often influence subsequent features)
113 | - [ ] use a stronger LLM to build a scaffold for a weaker LLM
114 | - [ ] find or create more realistic 'software engineering' benchmark tasks
115 |
116 | ## Agent Description
117 |
118 | The agent in `base_agent` is a minimal agent that can just about perform the
119 | meta-improvement task. It lacks efficient file editing tools, devtools such as
120 | tree-sitter or LSP integrations, and advanced reasoning structures that would
121 | help it when performing coding tasks. It has the necessary building blocks
122 | to bootstrap these features and specialise itself to the distribution of
123 | benchmark tasks included.
124 |
125 | Please see `base_agent/README.md` for a more detailed discussion of the base agent framework.
126 |
127 | ```
128 | ├── base_agent
129 | │   ├── agent_change_log.md
130 | │   ├── agent.py
131 | │   ├── conftest.py
132 | │   ├── description.txt
133 | │   ├── __main__.py
134 | │   ├── pytest.ini
135 | │   ├── README.md
136 | │   ├── requirements.txt
137 | │   ├── src
138 | │   │   ├── agents
139 | │   │   ├── benchmarks
140 | │   │   ├── callgraph
141 | │   │   ├── config.py
142 | │   │   ├── events
143 | │   │   ├── __init__.py
144 | │   │   ├── llm
145 | │   │   ├── oversight
146 | │   │   ├── schemas
147 | │   │   ├── tools
148 | │   │   ├── types
149 | │   │   ├── utils
150 | │   │   └── web_server
151 | │   └── tests
152 | │       ├── agents
153 | │       ├── benchmarks
154 | │       ├── events
155 | │       ├── __pycache__
156 | │       ├── test_example.py
157 | │       ├── tools
158 | │       └── utils
159 | ├── benchmark_data
160 | ├── results
161 | │   ├── run_
162 | │   └── interactive_output
163 | ├── runner.py
164 | └── sandbox
165 | ```
166 |
167 | ### Results Organization
168 |
169 | ```
170 | results/run_{id}/
171 | ├── metadata.json              # Experiment metadata
172 | └── agent_{i}/                 # Agent iteration directory
173 |     ├── agent_code/            # Agent implementation
174 |     ├── benchmarks/            # Benchmark results
175 |     │   └── {bench_name}/
176 |     │       ├── results.jsonl  # Per-problem results
177 |     │       ├── perf.jsonl     # Summary metrics
178 |     │       └── traces/        # Detailed traces
179 |     └── meta_improvement/      # Improvement logs
180 | ```
181 |
182 | ## Citation
183 |
184 | ```
185 | @inproceedings{
186 | robeyns2025sica,
187 | title={{SICA}: A Self-Improving Coding Agent},
188 | author={Maxime Robeyns and Martin Szummer and Laurence Aitchison},
189 | booktitle={ICLR 2025 Workshop on Scaling Self-Improving Foundation Models},
190 | year={2025},
191 | url={https://openreview.net/forum?id=rShJCyLsOr}
192 | }
193 | ```
194 |
--------------------------------------------------------------------------------
/base_agent/.gitignore:
--------------------------------------------------------------------------------
1 | ENV_VARS
2 |
--------------------------------------------------------------------------------
/base_agent/agent_change_log.md:
--------------------------------------------------------------------------------
1 | # Agent Codebase Change Log
2 |
3 | | Iteration | Change Name | Was Successful? (pending/yes/no) |
4 | |-----------|-------------|----------------------------------|
5 | | 0 | Base Agent | yes |
6 |
7 |
8 | ## Iteration 0: Base Agent
9 |
10 | This is a template iteration which you should follow for subsequent iterations.
11 |
12 | ### Feature Description
13 |
14 | This is to be written at iteration i (in this case, i=0). Describe the intention / motivation / hypothesis behind the change made.
15 |
16 | ### Feature Outcome
17 |
18 | This part is supposed to be written at iteration i + 1 (and potentially updated at subsequent iterations), and comments on the empirical effectiveness of the change.
19 |
20 | ## Iteration 1:
21 |
--------------------------------------------------------------------------------
/base_agent/conftest.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import pytest
7 |
8 | # Enable asyncio support for pytest
9 | pytest_plugins = ["pytest_asyncio"]
10 |
11 | # Optional: Define custom command-line options for your markers
12 | def pytest_addoption(parser):
13 | parser.addoption(
14 | "--run-llm",
15 | action="store_true",
16 | default=False,
17 | help="Run tests marked with 'uses_llm'",
18 | )
19 | parser.addoption(
20 | "--run-slow",
21 | action="store_true",
22 | default=False,
23 | help="Run tests marked with 'slow'",
24 | )
25 |
26 | # Skip tests based on markers unless the corresponding option is provided
27 | def pytest_collection_modifyitems(config, items):
28 | if not config.getoption("--run-llm"):
29 | skip_llm = pytest.mark.skip(reason="need --run-llm option to run")
30 | for item in items:
31 | if "uses_llm" in item.keywords:
32 | item.add_marker(skip_llm)
33 | if not config.getoption("--run-slow"):
34 | skip_slow = pytest.mark.skip(reason="need --run-slow option to run")
35 | for item in items:
36 | if "slow" in item.keywords:
37 | item.add_marker(skip_slow)
38 |
--------------------------------------------------------------------------------
/base_agent/description.txt:
--------------------------------------------------------------------------------
1 | This is the base, v0 agent that is used as a starting point.
2 |
--------------------------------------------------------------------------------
/base_agent/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | pythonpath = .
3 | # --ff for previously failed first
4 | # -l for print state on failure
5 | # -x for stop on first failure
6 | # -s for show stdout while testing
7 | # -v for verbose (e.g. show test names)
8 | # -n for n threadsafe parallel workers
9 | addopts = -l -x --ff -s -v
10 | testpaths = tests
11 | filterwarnings = ignore::DeprecationWarning
12 | asyncio_default_fixture_loop_scope = function
13 | markers =
14 | uses_llm: marks tests as using llms (run with '--run-llm')
15 | asyncio: marks tests as asynchronous
16 | integration: marks tests as integration tests
17 | slow: marks tests that run slowly
18 | performance: marks tests that benchmark performance (run with '-m performance')
19 |
--------------------------------------------------------------------------------
/base_agent/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonlines
2 | cryptography
3 | datasets
4 | tiktoken
5 | pydantic[email]
6 | pydantic-settings
7 | python-dotenv
8 | anthropic[vertex]==0.42.0
9 | tabulate
10 | openai
11 | json-repair
12 | rich
13 | jinja2
14 | fastapi
15 | uvicorn[standard]
16 | GitPython
17 | diff-match-patch
18 | swebench
19 | duckduckgo-search
20 | scipy
21 | sympy
22 | google-genai
23 | googlesearch-python
24 | pytest
25 | pytest-asyncio
26 | google-cloud-aiplatform
27 |
--------------------------------------------------------------------------------
/base_agent/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/__init__.py
--------------------------------------------------------------------------------
/base_agent/src/agents/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | The agents module defines the agents that can be composed and called to
8 | construct the broader scaffolding system. An agent might be thought of as a
9 | function in a program in the sense that they can be invoked, invoke other
10 | agents themselves, be composed and so forth - indeed, we maintain a 'callgraph'
11 | of the agent calls in the system.
12 |
13 | Individually, an "agent" is just a class that is used to carefully compose an
14 | LLM's context. The way the LLM sees the context is as follows:
15 |
16 | - a system prompt section, in which the "agent's" definition, goals, and
17 | available tools and sub-agents are defined
18 | - the first "user" message, referred to as the core prompt section, which is
19 | defined by the agent itself and which pertains to the way in which the agent
20 | should go about its execution; what sequence of steps it should follow, what
21 | it should focus on, what outcomes it should try to achieve. This is also
22 | where we put visualisations of system state such as file trees and file
23 | viewers.
24 | - the "assistant" message, which contains the agent's response and consists of
25 | alternating sequences of thought and tool or sub-agent calls. The
26 | 'function calling interface' for tools and sub-agents is very similar:
27 | consisting of an XML sequence whose last closing tag is a stop token. After
29 | being generated, the LLM will stop, and the contents of the XML will be parsed
30 | to identify the tool or sub-agent name and the arguments provided; these
31 | will be validated, the tool or sub-agent will be run, and the response will
32 | be serialised. These will then be concatenated to the previously generated
32 | assistant message, and the LLM will be called again with this as the
33 | assistant "pre-fill".
34 |
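For illustration only (the exact tag names are defined by the tool classes
rather than fixed here, so treat this as a hypothetical sketch), a single
tool invocation rendered into the assistant message might look like:

    <TOOL_CALL>
    <TOOL_NAME>calculator</TOOL_NAME>
    <expression>3 * 7</expression>
    </TOOL_CALL>

where the final closing tag also serves as the stop token mentioned above.
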
35 | Note, the way this is implemented, and the programming model to maintain, is
36 | that each agent maintains an 'event stream', published to the event bus. This
37 | is a list of events (such as new assistant messages, tool calls and results,
38 | agent calls and results, file events, overseer notifications and so forth)
39 | which describes the execution of the agent. The assistant message is
40 | reconstructed by filtering this event stream and concatenating the values. At a
41 | basic level, just the assistant messages and tool / agent results can be
42 | concatenated, although other event types can be included. For instance, the
43 | file open event may also be included here (with a view of the file content) in
44 | order to save re-generating the core prompt, which would cause a KV cache miss.
45 | By only appending to the LLM agent's context, we can avoid breaking the
46 | cache, at the cost of lengthening it and potentially duplicating content;
47 | eventually it becomes more cost effective to consolidate all this file state
48 | into the core prompt, shortening the prompt at the cost of re-calculating the KV cache.
49 |
50 | Also note that overseer notification events are handled slightly differently.
51 | When reconstructing the event stream, we stop the current assistant message,
52 | add the overseer notification in a new 'user' message, before continuing with
53 | the rest of the events in a new assistant pre-fill message.
54 | """
55 |
--------------------------------------------------------------------------------
/base_agent/src/agents/implementations/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Built-in agent agents providing core capabilities."""
7 |
8 | from ..base_agent import BaseAgent, AgentResult
9 |
10 |
11 | class DemoAgent(BaseAgent):
12 | """Agent for constructing examples in tools"""
13 |
14 | AGENT_NAME = "demo_agent"
15 | AGENT_DESCRIPTION = "a dummy agent for demonstration"
16 | SYSTEM_PROMPT = ""
17 |
18 | async def construct_core_prompt(self) -> str:
19 | return ""
20 |
21 | @classmethod
22 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]:
23 | return []
24 |
--------------------------------------------------------------------------------
/base_agent/src/agents/implementations/problem_solver.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from typing import List
7 | from pathlib import Path
8 | from pydantic import Field
9 |
10 |
11 | from .reasoner import ReasoningAgent
12 | from ..base_agent import BaseAgent
13 | from ...config import settings
14 | from ...tools.calculator import Calculator
15 | from ...tools.directory_tools import ViewDirectory
16 | from ...tools.execute_command import ExecuteCommand
17 | from ...tools.file_tools import OpenFile, CloseFile
18 | from ...tools.edit_tools import OverwriteFile
19 | from ...tools.ripgrep_tool import RipGrepTool
20 | from ...tools.committee_design import ReviewCommittee
21 | from .coder import CodingAgent
22 | from ...utils.metrics import make_random_agent_metrics
23 | from ...types.agent_types import AgentStatus, AgentResult
24 | from ...events.event_bus_utils import get_problem_statement
25 |
26 |
27 | class ProblemSolvingAgent(BaseAgent):
28 | """
29 | A multi-purpose problem-solving agent with access to all tools and capabilities.
30 |
31 | This agent can:
32 | 1. Analyze and decompose complex problems
33 | 2. Plan and execute solutions systematically
34 | 3. Use a wide range of tools and agents
35 | 4. Validate and refine solutions
36 | 5. Handle errors and edge cases
37 | 6. Document and explain its process
38 | """
39 |
40 | AGENT_NAME = "general_problem_solver"
41 |
42 | AGENT_DESCRIPTION = """
43 | Your default agent for all tasks. Highly versatile, with broad tool access. Best for tasks requiring multiple capabilities or when specific agent choice isn't obvious.
44 |
45 | Note that the agent will not have the context that you have, nor will it be able to see the initial problem statement verbatim. It is up to you to accurately relay this to the sub-agent, or decompose it into sub-tasks if it is very long and repeating it verbatim would be slow and costly.
46 |
47 | Example capabilities
48 | - Problem decomposition and analysis
49 | - General purpose writing tasks
50 | - Basic coding (although not specialised)
51 | - Quick system and file operations
52 | - Mathematical computation
53 | - Running shell commands
54 |
55 | Choose when:
56 | - Specific agent isn't clearly better
57 | - Need flexible approach
58 |
59 | Avoid when:
60 | - Task fits squarely in another agent's specialty
61 | - Requires deep domain expertise"""
62 |
63 | SYSTEM_PROMPT = """You are a highly competent problem solver who finds solutions swiftly and effectively.
64 |
65 | You should
66 | 1. Understand the nature of the problem you have been given
67 | 2. Identify the optimal tools and methods you can use to solve your task
68 | 3. Swiftly execute on the problem
69 | 4. Continuously validate and check your work
70 |
71 | Aim for simple, elegant and correct solutions.
72 | """
73 |
74 | # Available tools - complete access to all tools
75 | # NOTE: ExitAgent and ReturnResult are automatically included
76 | AVAILABLE_TOOLS = {
77 | Calculator,
78 | ViewDirectory,
79 | ExecuteCommand,
80 | OpenFile,
81 | CloseFile,
82 | OverwriteFile,
83 | RipGrepTool,
84 | ReviewCommittee,
85 | }
86 |
87 | # Available agents
88 | # AVAILABLE_AGENTS = set()
89 | AVAILABLE_AGENTS = {ReasoningAgent, CodingAgent}
90 |
91 | HAS_FILEVIEW = True
92 |
93 | MODEL = settings.MODEL
94 | TEMPERATURE = 0.666
95 |
96 | # Agent parameters
97 | problem_statement: str = Field(
98 | ...,
99 | description="The problem or request you want the problem solver agent to solve",
100 | )
101 | previous_agent_runs: List[str] = Field(
102 | default=[],
103 | description="A list of descriptions of previous work undertaken by other agents, the context from which this agent would benefit from knowing. This helps to avoid duplicate work.",
104 | )
105 | requirements: List[str] = Field(
106 | default=[],
107 | description="A list of very specific and low-level criteria which must be met or become valid for the sub-agent to consider its work done.",
108 | )
109 |
110 | def __init__(
111 | self,
112 | parent: BaseAgent | None = None,
113 | workdir: Path | None = None,
114 | logdir: Path | None = None,
115 | debug_mode: bool = False,
116 | **data,
117 | ):
118 | super().__init__(
119 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data
120 | )
121 |
122 | async def construct_core_prompt(self) -> str:
123 | """Construct the core prompt for problem solving."""
124 |
125 | # initial_request = await get_problem_statement()
126 | # if initial_request is None or initial_request == "":
127 | # raise ValueError(
128 | # "The initial request was not provided to the problem solver"
129 | # )
130 |
131 | prompt = f"""Here is the problem you have been asked to solve:
132 |
133 |
134 | {self.problem_statement}
135 |
136 | """
137 |
138 | if self.previous_agent_runs:
139 | prompt += "\n\nWork Previously Completed:"
140 | prompt += "\nYou should pay attention to this list to avoid duplicating work. Also note that this list is for work completed by other agents, which aren't 100% reliable, so treat claims with appropriate caution, and verify accordingly."
141 | for work in self.previous_agent_runs:
142 | prompt += f"\n- {work}"
143 |
144 | if self.requirements:
145 | prompt += "\n\nSpecific requirements which must be met before you can consider the work 'done':"
146 | for req in self.requirements:
147 | prompt += f"\n- {req}"
148 |
149 | prompt += "\n\nReturn your answer when complete."
150 |
151 | return prompt
152 |
153 | @classmethod
154 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]:
155 | """Generate example uses of the tool with their expected outputs."""
156 | examples = [
157 | # Example 1: Mathematical Problem Solving
158 | (
159 | cls(
160 | problem_statement="""Solve the following system of equations:
161 | 3x + 2y = 12
162 | x - y = 1""",
163 | requirements=[
164 | "Show the full answer derivation",
165 | "Verify the solution numerically using Python",
166 | ],
167 | ),
168 | AgentResult(
169 | agent_name=cls.AGENT_NAME,
170 | status=AgentStatus.SUCCESS,
171 | result="""Solution found: x = 4, y = 3
172 |
173 | Process:
174 | 1. Used elimination method
175 | 2. Verified by substitution in a Python script
176 | 3. Checked both equations
177 | 4. All validation criteria met""",
178 | metrics=make_random_agent_metrics(
179 | tools_enabled=True, agents_enabled=True
180 | ),
181 | ),
182 | ),
183 | # Example 2: Code Analysis and Modification
184 | # (
185 | # cls(
186 | # problem_statement="""Fix the performance issue in process_data() agent:
187 | #
188 | # - Current implementation uses O(n²) time
189 | # - Need to optimize to O(n) complexity
190 | # - Maintain existing API contract""",
191 | # requirements=[
192 | # "Keep existing agent signature",
193 | # "Maintain thread safety",
194 | # "Add performance tests",
195 | # ],
196 | # ),
197 | # AgentResult(
198 | # agent_name=cls.AGENT_NAME,
199 | # status=AgentStatus.SUCCESS,
200 | # result="""Optimized process_data() agent:
201 | #
202 | # 1. Analyzed existing implementation
203 | # 2. Identified quadratic loop pattern
204 | # 3. Refactored to use hash table
205 | # 4. Added performance tests
206 | # 5. Verified thread safety
207 | # 6. Maintained API compatibility
208 | #
209 | # Performance improved:
210 | # - Before: O(n²) time, O(1) space
211 | # - After: O(n) time, O(n) space
212 | # - Verified with test suite""",
213 | # metrics=make_random_agent_metrics(
214 | # tools_enabled=True, agents_enabled=True
215 | # ),
216 | # ),
217 | # ),
218 | ]
219 | return examples
220 |
--------------------------------------------------------------------------------
/base_agent/src/agents/implementations/review_committee_member.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from pathlib import Path
7 | from pydantic import Field
8 |
9 |
10 | from ..base_agent import BaseAgent
11 | from ...config import settings
12 | from ...tools.directory_tools import ViewDirectory
13 | from ...tools.file_tools import OpenFile, CloseFile
14 | from ...tools.ripgrep_tool import RipGrepTool
15 | from ...utils.metrics import make_random_agent_metrics
16 | from ...types.agent_types import AgentStatus, AgentResult
17 | from ...types.llm_types import Model
18 | from .reasoner import ReasoningAgent
19 |
20 |
21 | class CommitteeMember(BaseAgent):
22 | """
23 | A simple review committee agent, with read-only access to the project.
24 | """
25 |
26 | AGENT_NAME = "meta_agent_design_reviewer"
27 |
28 | AGENT_DESCRIPTION = """A meta-agent design review committee member. Called from the committee_design tool."""
29 |
30 | SYSTEM_PROMPT = """You are a member of a Meta-Agent design review committee, tasked with evaluating a coding agent's design proposal (about how to improve a coding agent system) before it begins work on the implementation. Your role is to provide a detailed, constructive and reasonable critique that ensures the proposed design avoids commonly identified pathologies in coding agent design, and is robust, practical, and aligned with the goals of the self-improving coding agent.
31 |
32 | Approach the review with a critical yet collaborative mindset, drawing on established engineering principles such as simplicity (delete unnecessary parts), conceptual integrity (a cohesive whole), and testability.
33 |
34 | You must ensure that the design is grounded in making the coding agent system better at writing software, advocating for things like
35 | - improving the mechanics of writing the code files: more efficient file editing strategies and tools
36 | - building reasoning and organisational structures which guide the agent to generate better code
37 | - things which improve the speed with which the agent is able to complete code tasks
38 | - features which improve the quality of the written code, such as better formatting and structure for generated code, utilities for robust and efficient testing, or improvements to the maintainability of the code
39 |
40 | Focus on the following desiderata:
41 | - Clarity: Is the proposal understandable and well-articulated?
42 | - Feasibility: Can it be realistically implemented given constraints?
43 | - Robustness: Does it handle real-world challenges (e.g., edge cases, failures)?
44 | - Quality: Does it reflect good design and testing practices for long-term value?
45 | - Grounding: Is it supported by executable feedback (e.g., tests) to verify its claims?
46 |
47 | Provide a structured evaluation: identify strengths, flag weaknesses, and suggest actionable improvements. Avoid vague or frivolous feedback; every critique should tie back to the project's success. Your specialized role will guide your focus, but always consider the proposal as a whole."""
48 |
49 | # Available tools
50 | # NOTE: ExitAgent and ReturnResult are automatically included
51 | # We limit ourselves to 'read only' tools.
52 | AVAILABLE_TOOLS = {
53 | ViewDirectory,
54 | OpenFile,
55 | CloseFile,
56 | RipGrepTool,
57 | }
58 |
59 | # Available agents
60 | # AVAILABLE_AGENTS = {ReasoningAgent}
61 | AVAILABLE_AGENTS = set()
62 |
63 | HAS_FILEVIEW = True
64 |
65 | MODEL = settings.MODEL
66 | TEMPERATURE = 0.666
67 |
68 | # Agent parameters
69 | proposal: str = Field(
70 | ...,
71 | description="The full proposal to review",
72 | )
73 | context: str = Field(
74 | ...,
75 | description="The motivation and context for understanding the plan",
76 | )
77 | specialisation: str = Field(
78 | ..., description="The specialisation of this committee member"
79 | )
80 | model: Model = Field(default=Model.SONNET_35)
81 |
82 | def __init__(
83 | self,
84 | parent: BaseAgent | None = None,
85 | workdir: Path | None = None,
86 | logdir: Path | None = None,
87 | debug_mode: bool = False,
88 | **data,
89 | ):
90 | super().__init__(
91 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data
92 | )
93 |
94 | async def construct_core_prompt(self) -> str:
95 | """Construct the core prompt for the committee member."""
96 |
97 | prompt = f"""{self.specialisation}
98 |
99 | Here are the agent's self-provided goals and the context surrounding the plan:
100 |
101 | {self.context}
102 |
103 |
104 | Here is the design proposal you have been asked to review:
105 |
106 |
107 | {self.proposal}
108 |
109 |
110 | You should read the README.md file first to get the full context of this self-improving coding agent project.
111 | You should then view the agent_change_log.md to get an idea of what (if anything) has already been tried by the coding agent as it attempts to improve itself, as measured by the benchmark performance.
112 | You can also quickly view any other code files that you need to get context on the proposal.
113 |
114 | Then, craft your review. Don't spend too long opening other files and doing research. Move swiftly. Note that you MUST provide your full review in the return_result tool since this is how it is communicated back. Anything not put in the return_result tool will not be seen by the agent.
115 |
116 | DO NOT attempt the task yourself, and avoid calling tools unless you absolutely need to. Then, simply provide your review in the return_result tool and complete.
117 | """
118 |
119 | return prompt
120 |
121 | @classmethod
122 | def generate_examples(cls) -> list[tuple["CommitteeMember", AgentResult]]:
123 | """Generate example uses of the tool with their expected outputs.
124 |
125 | Note that the committee member is deterministically invoked (for now)
126 | so these examples won't be used.
127 | """
128 | examples = []
129 | return examples
130 |
--------------------------------------------------------------------------------
/base_agent/src/benchmarks/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from typing import Type
7 | from collections import OrderedDict
8 |
9 | from .base import BaseBenchmark
10 | from .gpqa import GPQABenchmark
11 | from .aime import AIMEBenchmark
12 | from .drop import DROPBenchmark
13 | from .math import MATHBenchmark
14 | from .gsm8k import GSM8KBenchmark
15 | from .gsm_ic import GSMICBenchmark
16 | from .refute import RefuteBenchmark
17 | from .arc_agi import ARCAGIBenchmark
18 | from .humaneval import HumanEvalBenchmark
19 | from .file_editing import FileEditingBenchmark
20 | from .aiq_benchmark import AIQBenchmark
21 | from .livecodebench import LiveCodeBenchmark
22 | from .symbol_location import SymbolLocationBenchmark
23 | from .swebench_verified import SWEBenchBenchmark
24 | from .aiq_project_benchmarks import (
25 | LinalgAIQBenchmark,
26 | CSVParsingAIQBenchmark,
27 | MessagingAppAIQBenchmark,
28 | DistKVStoreAIQBenchmark,
29 | )
30 |
31 | # Important: append new benchmarks to the end of this registry
32 | benchmark_registry: OrderedDict[str, Type[BaseBenchmark]] = OrderedDict(
33 | [
34 | (GSM8KBenchmark.name, GSM8KBenchmark),
35 | # (DROPBenchmark.name, DROPBenchmark),
36 | # (ARCAGIBenchmark.name, ARCAGIBenchmark),
37 | # (MATHBenchmark.name, MATHBenchmark),
38 | # (GSMICBenchmark.name, GSMICBenchmark),
39 | # (FileEditingBenchmark.name, FileEditingBenchmark),
40 | # (SWEBenchBenchmark.name, SWEBenchBenchmark),
41 | # (HumanEvalBenchmark.name, HumanEvalBenchmark),
42 | # (AIMEBenchmark.name, AIMEBenchmark),
43 | # (GPQABenchmark.name, GPQABenchmark),
44 | # (LiveCodeBenchmark.name, LiveCodeBenchmark),
45 | # (SymbolLocationBenchmark.name, SymbolLocationBenchmark),
46 | # (RefuteBenchmark.name, RefuteBenchmark),
47 | # (AIQBenchmark.name, AIQBenchmark),
48 | # (LinalgAIQBenchmark.name, LinalgAIQBenchmark),
49 | # (CSVParsingAIQBenchmark.name, CSVParsingAIQBenchmark),
50 | # (MessagingAppAIQBenchmark.name, MessagingAppAIQBenchmark),
51 | # (DistKVStoreAIQBenchmark.name, DistKVStoreAIQBenchmark),
52 | ]
53 | )
54 |
--------------------------------------------------------------------------------
/base_agent/src/benchmarks/aime.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import random
7 | import logging
8 |
9 | from pathlib import Path
10 | from datasets import load_dataset
11 | from dataclasses import dataclass
12 |
13 | from .base import BaseBenchmark, Problem
14 |
15 | logging.basicConfig(level=logging.INFO)
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | @dataclass
20 | class AIMEExample:
21 | """A single AIME example."""
22 | problem_id: str # e.g., "2024-I-1"
23 | problem: str
24 | solution: str
25 | answer: int
26 |
27 | @classmethod
28 | def from_raw(cls, example: dict) -> "AIMEExample":
29 | """Create an AIMEExample from a raw dataset example."""
30 | return cls(
31 | problem_id=str(example["ID"]),
32 | problem=example["Problem"].strip(),
33 | solution=example["Solution"].strip(),
34 | answer=int(example["Answer"]) # AIME answers are always integers
35 | )
36 |
37 |
38 | class AIMEBenchmark(BaseBenchmark):
39 | """Benchmark for the American Invitational Mathematics Examination (AIME) 2024 dataset.
40 |
41 | The AIME is a prestigious high school mathematics competition known for its challenging
42 | mathematical problems. All answers in AIME are integers.
43 | """
44 |
45 | name = "aime"
46 |
47 | def __init__(self, seed: int | None = 1, subset_size: int | None = 20):
48 | super().__init__(seed, subset_size)
49 |
50 | # Load dataset from HuggingFace
51 | dataset = load_dataset("Maxwell-Jia/AIME_2024")
52 | self.test_data = [AIMEExample.from_raw(ex) for ex in dataset["train"]] # Dataset only has train split
53 |
54 | # Create randomized subset if requested
55 | if subset_size is not None:
56 | random.seed(seed)
57 | self.test_data = random.sample(self.test_data, subset_size)
58 |
59 | # Convert to Problem instances
60 | self._data = [
61 | Problem(
62 | problem_id=ex.problem_id,
63 | statement=ex.problem,
64 | answer=ex.answer,
65 | answer_discussion=ex.solution,
66 | )
67 | for ex in self.test_data
68 | ]
69 |
70 | @property
71 | def problems(self) -> list[Problem]:
72 | return self._data
73 |
74 | async def score_problem(
75 | self,
76 | problem: Problem,
77 | agent_workdir: str,
78 | agent_answer_dir: str,
79 | container_name: str,
80 | ) -> tuple[float, str | None, str | None]:
81 | """Score the answer to the problem.
82 |
83 | Since AIME answers are always integers, we can do exact matching without
84 | any floating-point comparison.
85 |
86 | Returns:
87 | tuple of:
88 | - score (0.0 or 1.0)
89 | - error message (if any)
90 | - solution discussion
91 | """
92 | try:
93 | answer_path = Path(agent_answer_dir) / "answer.txt"
94 | llm_answer = answer_path.read_text().strip()
95 |
96 | # Clean the answer by removing any commas and whitespace
97 | llm_answer = llm_answer.replace(",", "").replace(" ", "")
98 |
99 | # Convert to integer and compare exactly
100 | try:
101 | answer_int = int(llm_answer)
102 | if answer_int == problem.answer:
103 | return 1.0, None, problem.answer_discussion
104 | return 0.0, None, problem.answer_discussion
105 | except ValueError:
106 | return 0.0, "Answer must be an integer", problem.answer_discussion
107 |
108 | except Exception as e:
109 | logger.debug(f"Error in AIME scoring: {e}")
110 | return 0.0, str(e), problem.answer_discussion
111 |
112 |
113 | if __name__ == "__main__":
114 | import asyncio
115 | import tempfile
116 | def run_test_case(benchmark: AIMEBenchmark, answer_dir: Path,
117 | ground_truth: int, agent_answer: str, should_pass: bool):
118 | """Helper function to run a single test case"""
119 | print(f"\nTESTING: '{ground_truth}' vs '{agent_answer}' (should_pass={should_pass})")
120 |
121 | # Use first problem as template but override answer
122 | problem = benchmark.problems[0]
123 | problem.answer = ground_truth
124 | problem.answer_discussion = "Test discussion"
125 |
126 | answer_file = answer_dir / "answer.txt"
127 | answer_file.write_text(agent_answer)
128 |
129 | # score_problem is a coroutine, so drive it to completion with asyncio.run
130 | score, error, _ = asyncio.run(benchmark.score_problem(
131 | problem, str(answer_dir.parent), str(answer_dir), "test"))
132 |
133 | assert score == (1.0 if should_pass else 0.0), \
134 | f"Failed: '{ground_truth}' vs '{agent_answer}' got {score}, expected {1.0 if should_pass else 0.0}"
135 | if error:
136 | print(f"Error message: {error}")
137 |
138 | # Create test environment
139 | benchmark = AIMEBenchmark()
140 |
141 | with tempfile.TemporaryDirectory() as tmpdir:
142 | answer_dir = Path(tmpdir) / "answers"
143 | answer_dir.mkdir()
144 |
145 | print("\nTesting basic integer answers...")
146 | test_cases = [
147 | (42, "42", True),
148 | (42, "42.0", False), # Must be exact integer
149 | (1000, "1,000", True), # Allow commas
150 | (1000, "1000", True),
151 | (1000, " 1000 ", True), # Allow whitespace
152 | (42, "abc", False), # Non-numeric
153 | (-123, "-123", True), # Negative numbers
154 | (0, "0", True),
155 | (0, "0.0", False),
156 | (42, "41", False), # Wrong answer
157 | ]
158 | for truth, pred, should_pass in test_cases:
159 | run_test_case(benchmark, answer_dir, truth, pred, should_pass)
160 |
161 | # Test that the dataset loads correctly
162 | print("\nTesting dataset loading...")
163 | assert len(benchmark.problems) > 0, "Dataset should not be empty"
164 | assert all(isinstance(p.answer, int) for p in benchmark.problems), \
165 | "All answers should be integers"
166 | assert all(isinstance(p.problem_id, str) for p in benchmark.problems), \
167 | "All problem IDs should be strings"
168 | assert all(p.problem_id.startswith("2024-") for p in benchmark.problems), \
169 | "All problem IDs should start with 2024-"
170 |
171 | # Test subset functionality
172 | print("\nTesting subset functionality...")
173 | subset_size = 5
174 | benchmark_subset = AIMEBenchmark(seed=42, subset_size=subset_size)
175 | assert len(benchmark_subset.problems) == subset_size, \
176 | f"Subset size should be {subset_size}, got {len(benchmark_subset.problems)}"
177 |
178 | # Test seed reproducibility
179 | print("\nTesting seed reproducibility...")
180 | benchmark_subset1 = AIMEBenchmark(seed=42, subset_size=subset_size)
181 | benchmark_subset2 = AIMEBenchmark(seed=42, subset_size=subset_size)
182 | assert [p.problem_id for p in benchmark_subset1.problems] == \
183 | [p.problem_id for p in benchmark_subset2.problems], \
184 | "Same seed should produce same subset"
185 |
186 | # Test different seeds produce different subsets
187 | benchmark_subset3 = AIMEBenchmark(seed=43, subset_size=subset_size)
188 | assert [p.problem_id for p in benchmark_subset1.problems] != \
189 | [p.problem_id for p in benchmark_subset3.problems], \
190 | "Different seeds should produce different subsets"
191 |
192 | print("\nAll tests passed! ✨")
193 |
--------------------------------------------------------------------------------
/base_agent/src/benchmarks/base.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import jsonlines
7 |
8 | from abc import abstractmethod
9 | from typing import Any, ClassVar
10 | from pathlib import Path
11 | from datetime import datetime
12 | from dataclasses import dataclass, asdict
13 |
14 |
15 | @dataclass
16 | class Problem:
17 | """A single benchmark problem, containing a problem_id, problem statement and answer"""
18 |
19 | problem_id: str
20 | statement: str
21 | answer: Any
22 | answer_discussion: str | None
23 |
24 |
25 | @dataclass
26 | class ProblemResult:
27 | """Complete record of a single problem attempt"""
28 |
29 | problem_id: str
30 | timestamp: str | None = None
31 | score: float | None = None
32 | tokens_used: int | None = None
33 | num_cached_tokens: int | None = None
34 | cost_estimate: float | None = None
35 | wall_time: float | None = None
36 | timed_out: bool = False
37 | cost_threshold_exceeded: bool = False
38 |
39 | def is_complete(self) -> bool:
40 | # Considered complete if it has been scored
41 | return self.score is not None
42 |
43 | def update(self, **kwargs) -> None:
44 | for key, value in kwargs.items():
45 | if hasattr(self, key):
46 | setattr(self, key, value)
47 | else:
48 | raise ValueError(f"Invalid field {key} in ProblemResult update")
49 |
50 |
51 | class BenchmarkTracker:
52 | def __init__(self, results_path: Path):
53 | self.results_path = results_path
54 | self.results: dict[str, ProblemResult] = self._load_or_create()
55 |
56 | def _load_or_create(self) -> dict[str, ProblemResult]:
57 | results = {}
58 | if self.results_path.exists():
59 | with jsonlines.open(self.results_path) as reader:
60 | for line in reader:
61 | results[line["problem_id"]] = ProblemResult(**line)
62 | return results
63 |
64 | def start_problem(self, problem_id: str) -> None:
65 | result = ProblemResult(
66 | problem_id=problem_id, timestamp=datetime.now().isoformat()
67 | )
68 | self.results[problem_id] = result
69 | with jsonlines.open(self.results_path, mode="a") as writer:
70 | writer.write(asdict(result))
71 |
72 | def update_problem(self, problem_id: str, **kwargs) -> None:
73 | if problem_id not in self.results:
74 | raise KeyError(f"Problem {problem_id} not found")
75 |
76 | self.results[problem_id].update(**kwargs)
77 |
78 | # Rewrite the file with updated results
79 | with jsonlines.open(self.results_path, mode="w") as writer:
80 | writer.write_all(asdict(result) for result in self.results.values())
81 |
82 |
83 | class BaseBenchmark:
84 |
85 | name: ClassVar[str]
86 |
87 | def __init__(self, seed: int | None = None, subset_size: int | None = None):
88 | self.problem_idx: int = 0
89 | self.seed = seed
90 | self.subset_size = subset_size
91 |
92 | @property
93 | @abstractmethod
94 | def problems(self) -> list[Problem]:
95 | pass
96 |
97 | @abstractmethod
98 | async def score_problem(
99 | self,
100 | problem: Problem,
101 | agent_workdir: str,
102 | agent_answer_dir: str,
103 | container_name: str,
104 | ) -> tuple[float, str | None, str | None]:
105 | """
106 | Score the answer to the problem; the agent_workdir is an absolute path
107 | to the mapped /home/agent/workdir in the docker container, while the
108 | agent_answer_dir is the absolute path to the mapped logdir in the
109 | docker container, which should contain an answer.txt file.
110 |
111 | To get the submitted answer (if relevant):
112 |
113 | answer_path = Path(agent_answer_dir) / "answer.txt"
114 | llm_answer = answer_path.read_text().strip()
115 |
116 | Return the score (as a float), any parsing errors, and any additional
117 | discussion or information about the answer that can assist the summary.
118 | """
119 | pass
120 |
121 | def get_problem(self, problem_id: str) -> Problem | None:
122 | """Retrieve a specific problem by ID
123 | Overload this method if there is a more efficient way of locating the
124 | problem by problem_id.
125 | """
126 | return next((p for p in self.problems if p.problem_id == problem_id), None)
127 |
128 | async def setup_problem(
129 | self, problem: Problem, problem_data_dir: Path, container_name: str
130 | ) -> None:
131 | """Optional hook for performing problem-specific setup.
132 |
133 | This is called before each problem is run. The problem_data_dir
134 | will be mounted in the agent's container at /home/agent/workdir.
135 |
136 | Args:
137 | problem: The problem being run
138 | problem_data_dir: Path to a temporary directory for problem data.
139 | This directory will be mounted in the agent's container.
140 | container_name: The name of the container that the problem will run in
141 | """
142 | pass # Default no-op implementation
143 |
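A minimal sketch of how a concrete benchmark can subclass BaseBenchmark. This is illustrative only: ToyArithmeticBenchmark is not part of the repo, and the import assumes the base_agent directory is on sys.path so that `src` resolves as it does for the repo's own modules.

    from pathlib import Path

    from src.benchmarks.base import BaseBenchmark, Problem  # assumes base_agent/ on sys.path


    class ToyArithmeticBenchmark(BaseBenchmark):
        """Hypothetical benchmark, used only to illustrate the subclassing contract."""

        name = "toy_arithmetic"

        def __init__(self, seed: int | None = None, subset_size: int | None = None):
            super().__init__(seed, subset_size)
            self._problems = [
                Problem(problem_id="0", statement="What is 2 + 2?",
                        answer=4.0, answer_discussion=None),
            ]

        @property
        def problems(self) -> list[Problem]:
            return self._problems

        async def score_problem(self, problem, agent_workdir, agent_answer_dir, container_name):
            try:
                # Read the answer the agent wrote via the submit_answer tool
                llm_answer = (Path(agent_answer_dir) / "answer.txt").read_text().strip()
                return (1.0 if float(llm_answer) == problem.answer else 0.0), None, None
            except Exception as e:
                return 0.0, str(e), problem.answer_discussion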
--------------------------------------------------------------------------------
/base_agent/src/benchmarks/gsm8k.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import re
7 | import random
8 | import logging
9 |
10 | from typing import List
11 | from pathlib import Path
12 | from datasets import load_dataset
13 | from dataclasses import dataclass
14 |
15 | from .base import BaseBenchmark, Problem
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | @dataclass
21 | class GSM8KExample:
22 | """A single GSM8K example."""
23 |
24 | question: str
25 | answer: str
26 | steps: list[str]
27 | final_answer: float
28 |
29 | @classmethod
30 | def from_raw(cls, example: dict) -> "GSM8KExample":
31 | """Create a GSM8KExample from a raw dataset example."""
32 | # Split answer into steps and final answer
33 | answer_parts = example["answer"].split("####")
34 | steps = [s.strip() for s in answer_parts[0].split("\n") if s.strip()]
35 | final_answer = float(answer_parts[1].strip().replace(",", ""))
36 |
37 | return cls(
38 | question=example["question"].strip() + "\n\nWhen submitting your answer, please just give a single number with no accompanying text, units or other markings.",
39 | answer=example["answer"].strip(),
40 | steps=steps,
41 | final_answer=final_answer,
42 | )
43 |
44 | def extract_calculations(self) -> List[tuple[str, float, float]]:
45 | """Extract arithmetic calculations from the solution steps.
46 |
47 | Returns:
48 | List of tuples containing (expression, expected_result, actual_result)
49 | """
50 | calculations = []
51 | pattern = r"<<(.+?)=(.+?)>>"
52 |
53 | for step in self.steps:
54 | matches = re.finditer(pattern, step)
55 | for match in matches:
56 | expr, result = match.groups()
57 | try:
58 | # Clean the expression and make it Python-safe
59 | expr = expr.strip().replace("×", "*").replace("÷", "/")
60 | actual = eval(
61 | expr
62 | ) # Note: eval is safe here as we control the input
63 | expected = float(result)
64 | calculations.append((expr, expected, actual))
65 |                 except Exception:
66 | continue
67 |
68 | return calculations
69 |
70 |
71 | class GSM8KBenchmark(BaseBenchmark):
72 |
73 | name = "gsm8k"
74 |
75 | def __init__(self, seed: int | None = None, subset_size: int | None = None):
76 | super().__init__(seed, subset_size)
77 |
78 | # Validate inputs
79 | if subset_size is not None and subset_size <= 0:
80 | raise ValueError("subset_size must be positive")
81 |
82 | dataset = load_dataset("openai/gsm8k", "main")
83 | # self.train_data = [GSM8KExample.from_raw(ex) for ex in dataset["train"]]
84 | self.test_data = [GSM8KExample.from_raw(ex) for ex in dataset["test"]]
85 |
86 | self._data = [
87 | Problem(problem_id=str(i), statement=p.question, answer=p.final_answer, answer_discussion="\n".join(p.steps))
88 | for i, p in enumerate(self.test_data)
89 | ]
90 |
91 | # Create randomized subset if requested
92 | if subset_size is not None:
93 | random.seed(seed)
94 | self._data = random.sample(self._data, subset_size)
95 |
96 | @property
97 | def problems(self) -> list[Problem]:
98 | return self._data
99 |
100 | async def score_problem(
101 | self,
102 | problem: Problem,
103 | agent_workdir: str,
104 | agent_answer_dir: str,
105 | container_name: str,
106 | ) -> tuple[float, str | None, str | None]:
107 | try:
108 | answer_path = Path(agent_answer_dir) / "answer.txt"
109 | llm_answer = answer_path.read_text().strip()
110 |
111 | float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
112 | if abs(problem.answer - float_answer) < 1e-7:
113 | return 1.0, None, problem.answer_discussion
114 | else:
115 | return 0.0, None, problem.answer_discussion
116 | except Exception as e:
117 | logger.debug(f"Error in gsm8k scoring: {e}")
118 | return 0.0, str(e), problem.answer_discussion
119 |
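A short, hand-written example of the `<<expr=result>>` annotation convention that extract_calculations checks. The raw dict below is illustrative rather than taken from the dataset, and the import assumes base_agent/ is on sys.path.

    from src.benchmarks.gsm8k import GSM8KExample

    raw = {
        "question": "Tom has 3 boxes of 4 apples. How many apples does he have?",
        "answer": "He has <<3*4=12>>12 apples.\n#### 12",
    }
    ex = GSM8KExample.from_raw(raw)
    print(ex.final_answer)              # 12.0, parsed from the text after ####
    for expr, expected, actual in ex.extract_calculations():
        print(expr, expected, actual)   # 3*4 12.0 12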
--------------------------------------------------------------------------------
/base_agent/src/benchmarks/gsm_ic.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import random
7 | import logging
8 |
9 | from pathlib import Path
10 | from datasets import load_dataset
11 | from dataclasses import dataclass
12 |
13 | from .base import BaseBenchmark, Problem
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @dataclass
19 | class GSMICExample:
20 | """A single GSM-IC example with irrelevant context."""
21 |
22 | question: str
23 | answer: float
24 | n_steps: int
25 |
26 | @classmethod
27 | def from_raw(cls, example: dict) -> "GSMICExample":
28 | """Create a GSMICExample from a raw dataset example."""
29 | return cls(
30 | question=example["question"].strip(),
31 | answer=float(str(example["answer"]).strip().replace(",", "")),
32 | n_steps=int(example["n_steps"]),
33 | )
34 |
35 |
36 | class GSMICBenchmark(BaseBenchmark):
37 | """Benchmark for the GSM-IC dataset that tests mathematical reasoning with irrelevant context."""
38 |
39 | name = "gsm_ic"
40 |
41 | def __init__(self, seed: int | None = None, subset_size: int | None = None):
42 | """Initialize the GSM-IC benchmark.
43 |
44 | Args:
45 |             subset_size: Number of problems to use (None, the default, uses the full validation split)
46 | """
47 | super().__init__(seed, subset_size)
48 |
49 | # Validate inputs
50 | if subset_size is not None and subset_size <= 0:
51 | raise ValueError("subset_size must be positive")
52 |
53 | # Load the dataset
54 | dataset = load_dataset("voidful/GSM-IC")
55 | self.data = [GSMICExample.from_raw(ex) for ex in dataset["validation"]]
56 |
57 | # Create problem instances, limiting to subset_size
58 | self._problems = [
59 | Problem(problem_id=str(i), statement=p.question, answer=p.answer, answer_discussion=None)
60 | for i, p in enumerate(self.data)
61 | ]
62 |
63 | # Create randomized subset if requested
64 | if subset_size is not None:
65 | random.seed(seed)
66 | self._problems = random.sample(self._problems, subset_size)
67 |
68 | @property
69 | def problems(self) -> list[Problem]:
70 | """Return the list of problems."""
71 | return self._problems
72 |
73 | async def score_problem(
74 | self,
75 | problem: Problem,
76 | agent_workdir: str,
77 | agent_answer_dir: str,
78 | container_name: str,
79 | ) -> tuple[float, str | None, str | None]:
80 |         """Score the answer in agent_answer_dir/answer.txt against the ground truth.
81 | 
82 |         Args:
83 |             problem: Problem instance containing the ground truth
84 |             agent_answer_dir: Directory containing the agent's answer.txt (remaining args unused here)
85 | 
86 |         Returns:
87 |             A (score, parse_error, discussion) tuple, with score 1.0 if correct and 0.0 otherwise
88 |         """
89 | try:
90 | answer_path = Path(agent_answer_dir) / "answer.txt"
91 | llm_answer = answer_path.read_text().strip()
92 |
93 | # Clean and convert llm answer to float
94 | float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
95 |
96 | # Compare with small tolerance
97 | if abs(problem.answer - float_answer) < 1e-7:
98 | return 1.0, None, None
99 | return 0.0, None, None
100 |
101 | except Exception as e:
102 | logger.debug(f"Error in GSM-IC scoring: {e}")
103 | return 0.0, str(e), None
104 |
--------------------------------------------------------------------------------
/base_agent/src/callgraph/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Call graph tracking and oversight for agent functions and tools.
8 |
9 | This module provides:
10 | - Graph data structures for tracking execution
11 | - Visualization utilities
12 | """
13 |
--------------------------------------------------------------------------------
/base_agent/src/callgraph/digraph.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Core directed graph implementation for tracking sub-agent calls.
7 |
8 | Note, in this module, the terms "agent" and "function" are used interchangeably.
9 | """
10 |
11 | from typing import Dict, Set, List, Optional, Iterator
12 | from datetime import datetime
13 | from dataclasses import dataclass, field
14 |
15 |
16 | @dataclass
17 | class FunctionNode:
18 | """
19 | Represents a function execution in the call graph.
20 |
21 | This tracks the essential metadata about a function execution,
22 | including timing, results, and relationships to other functions.
23 | """
24 |
25 | # Core identity
26 | id: str
27 | name: str
28 | parent_id: Optional[str] = None
29 | children: Set[str] = field(default_factory=set)
30 |
31 | # Execution state
32 | started_at: Optional[datetime] = None
33 | completed_at: Optional[datetime] = None
34 | success: bool | None = None
35 | error: Optional[str] = None
36 |
37 | # Function-specific data
38 | args: Dict = field(default_factory=dict)
39 | result: Optional[str] = None
40 |
41 | # Metrics
42 | token_count: int = 0
43 | num_cached_tokens: int = 0
44 | cost: float = 0.0
45 |
46 | @property
47 | def duration_seconds(self) -> Optional[float]:
48 | """Calculate execution duration if completed."""
49 | if self.completed_at and self.started_at:
50 | return (self.completed_at - self.started_at).total_seconds()
51 | return None
52 |
53 |
54 | class CallGraph:
55 | """
56 | Directed graph tracking function calls / agent calls.
57 |
58 | The graph maintains parent-child relationships between function
59 | calls and tracks execution metrics for each function.
60 | """
61 |
62 | def __init__(self):
63 | self.nodes: Dict[str, FunctionNode] = {}
64 | self._root_id: Optional[str] = None
65 |
66 | @property
67 | def root(self) -> Optional[FunctionNode]:
68 | """Get the root node if it exists."""
69 | return self.nodes.get(self._root_id) if self._root_id else None
70 |
71 | def add_node(self, node: FunctionNode) -> None:
72 | """
73 | Add a node to the graph.
74 |
75 | If this is the first node, it becomes the root.
76 | """
77 | self.nodes[node.id] = node
78 | if not self._root_id:
79 | self._root_id = node.id
80 |
81 | def get_node(self, node_id: str) -> Optional[FunctionNode]:
82 | """Get a node by ID."""
83 | return self.nodes.get(node_id)
84 |
85 | def add_edge(self, from_id: str, to_id: str) -> None:
86 | """Add a directed edge between nodes."""
87 | if from_id not in self.nodes or to_id not in self.nodes:
88 | raise ValueError("Both nodes must exist in the graph")
89 |
90 | self.nodes[from_id].children.add(to_id)
91 | self.nodes[to_id].parent_id = from_id
92 |
93 | def get_children(self, node_id: str) -> List[FunctionNode]:
94 | """Get all child nodes of a given node."""
95 | node = self.nodes.get(node_id)
96 | if not node:
97 | return []
98 | return [self.nodes[child_id] for child_id in node.children]
99 |
100 | def get_ancestors(self, node_id: str) -> List[FunctionNode]:
101 | """Get all ancestors of a node (parent, parent's parent, etc)."""
102 | ancestors = []
103 | current = self.nodes.get(node_id)
104 | while current and current.parent_id:
105 | parent = self.nodes.get(current.parent_id)
106 | if parent:
107 | ancestors.append(parent)
108 | current = parent
109 | else:
110 | break
111 | return ancestors
112 |
113 | def get_subtree(self, root_id: str) -> Set[str]:
114 | """Get all node IDs in the subtree rooted at root_id."""
115 | subtree = {root_id}
116 | node = self.nodes.get(root_id)
117 | if node:
118 | for child_id in node.children:
119 | subtree.update(self.get_subtree(child_id))
120 | return subtree
121 |
122 | def remove_subtree(self, root_id: str) -> None:
123 | """Remove a node and its entire subtree."""
124 | subtree = self.get_subtree(root_id)
125 | for node_id in subtree:
126 | node = self.nodes.pop(node_id, None)
127 | if node and node.parent_id:
128 | parent = self.nodes.get(node.parent_id)
129 | if parent:
130 | parent.children.remove(node_id)
131 |
132 | def iter_bfs(self) -> Iterator[FunctionNode]:
133 | """Iterate through nodes in breadth-first order."""
134 | if not self._root_id:
135 | return
136 |
137 | visited = set()
138 | queue = [self._root_id]
139 |
140 | while queue:
141 | node_id = queue.pop(0)
142 | if node_id not in visited:
143 | visited.add(node_id)
144 | node = self.nodes.get(node_id)
145 | if node:
146 | yield node
147 | queue.extend(node.children)
148 |
149 | def iter_dfs(self) -> Iterator[FunctionNode]:
150 | """Iterate through nodes in depth-first order."""
151 | if not self._root_id:
152 | return
153 |
154 | visited = set()
155 |
156 | def dfs(node_id: str) -> Iterator[FunctionNode]:
157 | if node_id not in visited:
158 | visited.add(node_id)
159 | node = self.nodes.get(node_id)
160 | if node:
161 | yield node
162 | for child_id in node.children:
163 | yield from dfs(child_id)
164 |
165 | yield from dfs(self._root_id)
166 |
167 | def find_cycles(self) -> List[List[str]]:
168 | """Find any cycles in the graph."""
169 | cycles = []
170 | visited = set()
171 | path = []
172 | path_set = set()
173 |
174 | def dfs(node_id: str) -> None:
175 | if node_id in path_set:
176 | cycle_start = path.index(node_id)
177 | cycles.append(path[cycle_start:] + [node_id])
178 | return
179 |
180 | if node_id in visited:
181 | return
182 |
183 | visited.add(node_id)
184 | path.append(node_id)
185 | path_set.add(node_id)
186 |
187 | node = self.nodes.get(node_id)
188 | if node:
189 | for child_id in node.children:
190 | dfs(child_id)
191 |
192 | path.pop()
193 | path_set.remove(node_id)
194 |
195 | if self._root_id:
196 | dfs(self._root_id)
197 |
198 | return cycles
199 |
200 | def get_execution_metrics(self) -> Dict:
201 | """Get overall execution metrics."""
202 | total_tokens = sum(n.token_count for n in self.nodes.values())
203 | num_cached_tokens = sum(n.num_cached_tokens for n in self.nodes.values())
204 | total_cost = sum(n.cost for n in self.nodes.values())
205 |
206 | complete_nodes = [
207 | n for n in self.nodes.values() if n.started_at and n.completed_at
208 | ]
209 |
210 | total_duration = (
211 | sum(
212 | n.duration_seconds
213 | for n in complete_nodes
214 | if n.duration_seconds is not None
215 | )
216 | if complete_nodes
217 | else 0
218 | )
219 |
220 | successes = sum(1 for n in self.nodes.values() if n.success)
221 |         failures = sum(1 for n in self.nodes.values() if n.success is False)  # don't count in-flight nodes
222 |
223 | return {
224 | "total_functions": len(self.nodes),
225 | "total_tokens": total_tokens,
226 | "num_cached_tokens": num_cached_tokens,
227 | "total_cost": total_cost,
228 | "total_duration": total_duration,
229 | "successful_calls": successes,
230 | "failed_calls": failures,
231 | }
232 |
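A small usage sketch for CallGraph; the node names and metrics below are made up, and the import assumes base_agent/ is on sys.path.

    from datetime import datetime, timedelta

    from src.callgraph.digraph import CallGraph, FunctionNode

    graph = CallGraph()
    graph.add_node(FunctionNode(id="root", name="main_orchestrator"))  # first node becomes the root
    graph.add_node(FunctionNode(id="c1", name="coder", token_count=1200, cost=0.01))
    graph.add_edge("root", "c1")

    # Mark the child as completed so duration and success are counted
    child = graph.get_node("c1")
    child.started_at = datetime.now()
    child.completed_at = child.started_at + timedelta(seconds=3)
    child.success = True

    print([n.name for n in graph.iter_bfs()])             # ['main_orchestrator', 'coder']
    print(graph.get_execution_metrics()["total_tokens"])  # 1200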
--------------------------------------------------------------------------------
/base_agent/src/config.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from pydantic import field_validator
7 | from pydantic_settings import BaseSettings
8 |
9 | from .types.llm_types import Model
10 |
11 |
12 | class Settings(BaseSettings):
13 | # Basic Agent Configuration
14 | NAME: str = "self_referential_agent"
15 | LOG_LEVEL: str = "INFO"
16 |
17 | MODEL: Model = Model.SONNET_37
18 | REASONING_MODEL: Model = Model.O3_MINI
19 | OVERSIGHT_MODEL: Model = Model.SONNET_37
20 |
21 | @field_validator("MODEL", "REASONING_MODEL", "OVERSIGHT_MODEL", mode="before")
22 | def parse_model(cls, value):
23 | """Convert a string model name into a Model enum instance."""
24 | if isinstance(value, str):
25 | return Model.from_name(value)
26 | elif isinstance(value, Model):
27 | return value
28 | raise ValueError(f"Invalid model value: {value!r}")
29 |
30 | model_config = {
31 | "env_prefix": "AGENT_",
32 | "case_sensitive": True,
33 | "extra": "allow", # Allow extra fields from environment
34 | }
35 |
36 |
37 | settings = Settings()
38 |
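A hedged sketch of overriding Settings via the AGENT_-prefixed environment variables. The field validator also accepts model-name strings via Model.from_name, but valid name strings are defined elsewhere, so only the plain LOG_LEVEL override is shown here.

    import os

    os.environ["AGENT_LOG_LEVEL"] = "DEBUG"  # must be set before src.config is imported,
                                             # since the module instantiates Settings() at import time

    from src.config import Settings  # assumes base_agent/ on sys.path

    print(Settings().LOG_LEVEL)  # -> DEBUG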
--------------------------------------------------------------------------------
/base_agent/src/events/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from .event_bus import EventBus
7 |
8 | __all__ = ["EventBus"]
9 |
--------------------------------------------------------------------------------
/base_agent/src/events/event_bus_utils.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Utility functions for working with the event bus.
7 |
8 | Note: these are entirely un-optimised, and many require inefficient iterations
9 | over the event lists to reconstruct 'views' on the event bus. For SWE bench
10 | style tasks, with only up to hundreds of messages, this is not important.
11 | """
12 |
13 | from typing import Optional, Set, List
14 |
15 | from .event_bus import EventBus
16 | from ..types.tool_types import ToolResult
17 | from ..types.agent_types import AgentResult
18 | from ..types.event_types import EventType, Event, FileOperation, FileEvent
19 |
20 |
21 | async def log_to_stdout(event: Event | FileEvent):
22 | """Print important events to stdout with clear formatting.
23 |
24 | This function is important to format since it is the visual feedback
25 | that the meta-agent will get back when test-running itself
26 | """
27 |
28 | # Common formatting constants
29 | max_content_len = 50 # Reduced to allow for richer metadata
30 | prefix_width = 10
31 |
32 | def truncate(text: str, length: int = max_content_len) -> str:
33 | """Helper to truncate text and handle newlines"""
34 | text = text.replace("\n", " ")
35 | return f"{text[:length]}..." if len(text) > length else text
36 |
37 | def format_output(prefix: str, content: str, metadata: str = "") -> None:
38 | """Helper to format and print consistent output"""
39 | print(
40 | f"{prefix:<{prefix_width}s} => {content}{' | ' + metadata if metadata else ''}"
41 | )
42 |
43 | event_content = truncate(str(event.content))
44 |
45 | if event.type in (EventType.CORE_PROMPT_UPDATE, EventType.SYSTEM_PROMPT_UPDATE):
46 | return
47 | elif event.type == EventType.ASSISTANT_MESSAGE:
48 | format_output(event.type.value, event_content)
49 | elif event.type == EventType.TOOL_CALL:
50 | name = event.metadata.get("name", "unknown tool")
51 | args = truncate(str(event.metadata.get("args", {})))
52 | format_output(event.type.value, f"{name}, {args}")
53 | elif event.type == EventType.TOOL_RESULT:
54 | result = event.metadata.get("tool_result")
55 | if not isinstance(result, ToolResult):
56 | return
57 | content = f"{result.tool_name}, success: {result.success}, "
58 | content += f"duration: {result.duration:.1f}, {event_content} "
59 | format_output(event.type.value, content)
60 | elif event.type == EventType.AGENT_CALL:
61 | name = event.metadata.get("name", "unknown agent")
62 | args = truncate(str(event.metadata.get("args", {})))
63 | format_output(event.type.value, f"{name}, {args}")
64 | elif event.type == EventType.AGENT_RESULT:
65 | result = event.metadata.get("agent_result")
66 | if not isinstance(result, AgentResult):
67 | return
68 | name = result.agent_name
69 | status = result.status.value
70 | duration = result.metrics.duration_seconds or 0.0
71 | cost = result.metrics.cost
72 | res = truncate(result.result, 20)
73 | content = f"{name}, status: {status}, duration: {duration:.1f}, cost: ${cost:.4f}, {res}"
74 | format_output(event.type.value, content)
75 | else:
76 | format_output(event.type.value, event_content)
77 |
78 |
79 | async def get_problem_statement() -> str:
80 | """Get the initial problem statement."""
81 | event_bus = await EventBus.get_instance()
82 | # There should only be one, but we handle the case when it was updated somehow
83 | ps_events = event_bus.get_events_by_type(EventType.PROBLEM_STATEMENT)
84 | return "\n".join(ps.content for ps in ps_events) if len(ps_events) else ""
85 |
86 |
87 | async def get_budget_info() -> dict[str, int | float | None]:
88 |     """Get the most recently recorded budget information."""
89 | event_bus = await EventBus.get_instance()
90 | # There should only be one, but we handle the case when it was updated somehow
91 | ps_events = event_bus.get_events_by_type(EventType.BUDGET_INFO)
92 | if ps_events:
93 | return ps_events[-1].metadata
94 | else:
95 | return dict()
96 |
97 | async def get_latest_sys_prompt_event(agent_id: str | None = None) -> Optional[Event]:
98 | """Get the latest system prompt update event."""
99 | event_bus = await EventBus.get_instance()
100 | events = (
101 | event_bus.get_events_by_type(EventType.SYSTEM_PROMPT_UPDATE)
102 | if not agent_id
103 | else event_bus.get_events(agent_id)
104 | )
105 | system_prompts = [e for e in events if e.type == EventType.SYSTEM_PROMPT_UPDATE]
106 | return system_prompts[-1] if system_prompts else None
107 |
108 |
109 | async def get_latest_core_prompt_event(agent_id: str | None = None) -> Optional[Event]:
110 | """Get the latest core prompt update event."""
111 | event_bus = await EventBus.get_instance()
112 | events = (
113 | event_bus.get_events_by_type(EventType.CORE_PROMPT_UPDATE)
114 | if not agent_id
115 | else event_bus.get_events(agent_id)
116 | )
117 | core_prompts = [e for e in events if e.type == EventType.CORE_PROMPT_UPDATE]
118 | return core_prompts[-1] if core_prompts else None
119 |
120 |
121 | async def get_open_file_set(agent_id: str | None = None) -> Set[FileEvent]:
122 | """Get the set of currently open files."""
123 | event_bus = await EventBus.get_instance()
124 | open_files: dict[str, FileEvent] = {}
125 | events = (
126 | event_bus.get_events_by_type(EventType.FILE_EVENT)
127 | if not agent_id
128 | else [
129 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT
130 | ]
131 | )
132 |
133 | for event in events:
134 | if isinstance(event, FileEvent):
135 | if event.operation == FileOperation.CLOSE and event.path in open_files:
136 | open_files.pop(event.path)
137 | elif event.operation == FileOperation.OPEN:
138 | open_files[event.path] = event
139 | return set(open_files.values())
140 |
141 |
142 | async def is_file_open(file_path: str, agent_id: str | None = None) -> bool:
143 | """Check if a specific file is open."""
144 | open_files = await get_open_file_set(agent_id)
145 | return any(file_event.path == file_path for file_event in open_files)
146 |
147 |
148 | async def get_latest_file_event(
149 | file_path: str,
150 | agent_id: str | None = None,
151 | exclude_close: bool = False,
152 | ) -> Optional[FileEvent]:
153 | """Get the most recent file event for a given path."""
154 | event_bus = await EventBus.get_instance()
155 | events = (
156 | event_bus.get_events_by_type(EventType.FILE_EVENT)
157 | if not agent_id
158 | else [
159 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT
160 | ]
161 | )
162 |
163 | file_events = [
164 | e
165 | for e in events
166 | if isinstance(e, FileEvent)
167 | and e.path == file_path
168 | and (e.operation != FileOperation.CLOSE if exclude_close else True)
169 | ]
170 | return file_events[-1] if file_events else None
171 |
172 |
173 | async def get_file_content_size(agent_id: str | None = None) -> int:
174 | """Calculate total size of content from file events."""
175 | event_bus = await EventBus.get_instance()
176 | total_size = 0
177 | events = (
178 | event_bus.get_events_by_type(EventType.FILE_EVENT)
179 | if not agent_id
180 | else [
181 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT
182 | ]
183 | )
184 |
185 | for event in events:
186 | if isinstance(event, FileEvent):
187 | total_size += len(event.content.encode("utf-8"))
188 | return total_size
189 |
190 |
191 | async def get_subagent_events(
192 | agent_id: str,
193 | event_types: Set[EventType] = set(EventType),
194 | # event_types: Set[EventType] = {
195 | # EventType.ASSISTANT_MESSAGE,
196 | # EventType.TOOL_RESULT,
197 | # EventType.AGENT_RESULT,
198 | # EventType.FILE_EVENT,
199 | # EventType.EXTERNAL_MESSAGE,
200 | # },
201 | ) -> List[Event]:
202 | """Get events for prefilling assistant messages."""
203 | event_bus = await EventBus.get_instance()
204 | all_events = event_bus.get_events_in_chain(agent_id)
205 | return [e for e in all_events if e.type in event_types]
206 |
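The open/close folding that get_open_file_set performs, shown on plain (operation, path) tuples so that no real FileEvent objects (whose full field set is defined in the event types module) need to be constructed.

    # The most recent OPEN for a path wins; a CLOSE removes the entry if present.
    events = [("OPEN", "/a.py"), ("OPEN", "/b.py"), ("CLOSE", "/a.py")]

    open_files: dict[str, str] = {}
    for op, path in events:
        if op == "CLOSE" and path in open_files:
            open_files.pop(path)
        elif op == "OPEN":
            open_files[path] = path

    print(sorted(open_files))  # ['/b.py']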
--------------------------------------------------------------------------------
/base_agent/src/llm/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """LLM integration module for the Self-Referential Agent System.
7 |
8 | This module provides a unified interface for interacting with various LLM providers
9 | including Anthropic, OpenAI, DeepSeek, Fireworks, and Google/Vertex.
10 | """
11 |
12 | import logging
13 |
14 | from .base import (
15 | Message,
16 | Completion,
17 | CompletionChunk,
18 | TimingInfo,
19 | TextContent,
20 | ToolResultContent,
21 | )
22 | from .api import create_completion, create_streaming_completion
23 | from .metering import token_meter, get_total_cost
24 |
25 | # Quieten LLM API call logs to make stdout more useful
26 | logging.getLogger("httpx").setLevel(logging.WARNING)
27 |
28 | __all__ = [
29 | "Message",
30 | "Completion",
31 | "CompletionChunk",
32 | "TimingInfo",
33 | "create_completion",
34 | "create_streaming_completion",
35 | "token_meter",
36 | "get_total_cost",
37 | "TextContent",
38 | "ToolResultContent",
39 | ]
40 |
--------------------------------------------------------------------------------
/base_agent/src/llm/base.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Base models and shared functionality for LLM interactions."""
7 |
8 | from typing import Dict, Optional
9 | from datetime import datetime, timedelta
10 | from pydantic import BaseModel, Field
11 |
12 | from ..types.llm_types import TokenUsage, Model, StopReason, TextContent, ReasoningContent, ToolCallContent, ToolResultContent, ContentTypes
13 |
14 | # NOTE: perhaps move the rest of these classes to the llm_types for consistency
15 |
16 |
17 | class Message(BaseModel):
18 | """A message in a conversation with an LLM."""
19 |
20 | role: str
21 | content: list[ContentTypes]
22 | name: Optional[str] = None
23 |
24 | def __str__(self) -> str:
25 | parts = [f"Message from role={self.role}"]
26 | for c in self.content:
27 | if isinstance(c, TextContent):
28 | parts.append(f"Text {'-'*10}\n{c.text}")
29 | elif isinstance(c, ReasoningContent):
30 | parts.append(f"Reasoning {'-'*10}\n{c.text}")
31 | elif isinstance(c, ToolCallContent):
32 | parts.append(f"{'-'*10}\nTool call {c.tool_name} (id: {c.call_id}) {c.call_type}: {str(c.tool_args)}\n{'-'*10}")
33 | elif isinstance(c, ToolResultContent):
34 | parts.append(f"{'-'*10}\nTool result {c.tool_name} (id: {c.call_id}): {c.content}\n{'-'*10}")
35 | # return "\n".join([p.replace("\n", "").strip() for p in parts])
36 | return "\n".join(parts)
37 |
38 |
39 | class TimingInfo(BaseModel):
40 | """Timing information for LLM interactions."""
41 |
42 | start_time: datetime = Field(description="When the request started")
43 | end_time: datetime = Field(description="When the response completed")
44 | total_duration: timedelta = Field(description="Total duration of the request")
45 | first_token_time: Optional[datetime] = Field(
46 | None, description="When the first token was received"
47 | )
48 | time_to_first_token: Optional[float] = Field(
49 | None, description="Duration until first token received"
50 | )
51 | tokens_per_second: Optional[float] = Field(
52 | None, description="Average tokens per second for completion"
53 | )
54 |
55 | def __str__(self) -> str:
56 | # Format datetime fields to a readable format
57 | fmt = "%Y-%m-%d %H:%M:%S"
58 | parts = [
59 | f"- Start {self.start_time.strftime(fmt)}, End {self.end_time.strftime(fmt)}",
60 | f"- Duration: {self.total_duration}",
61 | ]
62 | if self.time_to_first_token is not None:
63 | parts.append(f"- TTFT: {self.time_to_first_token:.2f} sec")
64 | if self.tokens_per_second is not None:
65 | parts.append(f"- TPS: {self.tokens_per_second:.2f}")
66 | return "\n".join(parts)
67 |
68 | class CacheMetrics(BaseModel):
69 | """Cache-related metrics."""
70 |
71 | cache_hits: int = Field(default=0, description="Number of cache hits")
72 | cache_misses: int = Field(default=0, description="Number of cache misses")
73 | cache_writes: int = Field(default=0, description="Number of cache writes")
74 |
75 | @classmethod
76 | def from_dict(cls, data: Optional[Dict[str, int]] = None) -> "CacheMetrics":
77 | """Create metrics from dictionary, preserving provider values."""
78 | if data is None:
79 | data = {"cache_hits": 0, "cache_misses": 0, "cache_writes": 0}
80 | return cls(
81 | cache_hits=data.get("cache_hits", 0),
82 | cache_misses=data.get("cache_misses", 0),
83 | cache_writes=data.get("cache_writes", 0),
84 | )
85 |
86 | def to_dict(self) -> Dict[str, int]:
87 | """Convert to dictionary."""
88 | return self.model_dump() # Use model_dump instead of dict
89 |
90 |
91 | # Completion Types ============================================================
92 |
93 | class Completion(BaseModel):
94 | """A completion response from an LLM."""
95 |
96 | id: str
97 | content: list[ContentTypes] | list[list[ContentTypes]]
98 | model: Model # Model identifier string
99 | usage: TokenUsage
100 | timing: TimingInfo
101 | cache_metrics: Optional[Dict[str, int]] = None
102 | stop_reason: StopReason | list[StopReason] = StopReason.COMPLETE
103 |     stop_sequence: Optional[str] | list[Optional[str]] = None
104 | continuation_count: Optional[int] = None
105 | raw_response: Optional[Dict] = Field(default=None, exclude=True)
106 |
107 | @property
108 | def finished_early(self) -> bool:
109 | """Check if completion stopped before finishing normally."""
110 | return self.stop_reason != StopReason.COMPLETE
111 |
112 | @property
113 | def hit_token_limit(self) -> bool:
114 | """Check if completion stopped due to token length."""
115 | return self.stop_reason == StopReason.LENGTH
116 |
117 | @property
118 | def errored(self) -> bool:
119 | """Check if completion encountered an error."""
120 | return self.stop_reason == StopReason.ERROR
121 |
122 | def get_cache_metric(self, key: str, default: int = 0) -> int:
123 | """Get a cache metric value safely."""
124 | if self.cache_metrics is None:
125 | return default
126 | return self.cache_metrics.get(key, default)
127 |
128 | def calculate_cost(self) -> float:
129 | """Calculate the cost for this completion."""
130 | return self.usage.calculate_cost(self.model.token_cost)
131 |
132 | def __str__(self) -> str:
133 | comp_str = f"{'='*80}\n"
134 | if isinstance(self.content[0], list):
135 | for i, completion in enumerate(self.content):
136 | comp_str += f"Candidate {i:03d} {70*'-'}\n"
137 | for block in completion:
138 | comp_str += str(block) + "\n"
139 | else:
140 | for block in self.content:
141 | comp_str += str(block) + "\n"
142 | comp_str += f"\n{'-'*80}\n"
143 | comp_str += f"Model: {self.model.id}\n"
144 | comp_str += f"""Tokens used:
145 | - Input {self.usage.input_tokens} (cached: {self.usage.cached_prompt_tokens}, written to cache: {self.usage.cache_write_prompt_tokens})
146 | - Completion {self.usage.completion_tokens}
147 | - Total {self.usage.total_tokens}
148 | """
149 | if self.stop_reason != StopReason.COMPLETE:
150 | comp_str += f"Stop reason: {self.stop_reason}\n"
151 | if self.stop_sequence:
152 | comp_str += f"Stop sequence: {self.stop_sequence}\n"
153 |
154 | if self.continuation_count:
155 | comp_str += f"Continuations: {self.continuation_count}\n"
156 |
157 | if self.timing:
158 | comp_str += f"Timing:\n{self.timing}\n"
159 |
160 | comp_str += f"Cost: ${self.calculate_cost():.6f}\n"
161 |
162 | comp_str += f"{'='*80}\n"
163 | return comp_str
164 |
165 |
166 | class CompletionChunk(BaseModel):
167 | """A streaming chunk of a completion response."""
168 |
169 | id: str
170 | content: str # TODO: make tool call or assistant message string
171 | model: Model # Model identifier string
172 | is_finished: bool = False
173 | timing: Optional[TimingInfo] = None
174 | usage: Optional[TokenUsage] = None
175 | cache_metrics: Optional[Dict[str, int]] = None
176 | stop_reason: Optional[StopReason] = None
177 | continuation_count: Optional[int] = None
178 | raw_response: Optional[Dict] = Field(default=None, exclude=True)
179 |
180 | @property
181 | def finished_early(self) -> bool:
182 | """Check if completion stopped before finishing normally."""
183 | return bool(self.stop_reason and self.stop_reason != StopReason.COMPLETE)
184 |
185 | @property
186 | def hit_token_limit(self) -> bool:
187 | """Check if completion stopped due to token length."""
188 | return bool(self.stop_reason and self.stop_reason == StopReason.LENGTH)
189 |
190 | @property
191 | def errored(self) -> bool:
192 | """Check if completion encountered an error."""
193 | return bool(self.stop_reason and self.stop_reason == StopReason.ERROR)
194 |
195 | def get_cache_metric(self, key: str, default: int = 0) -> int:
196 | """Get a cache metric value safely."""
197 | if self.cache_metrics is None:
198 | return default
199 | return self.cache_metrics.get(key, default)
200 |
201 | def model_dump(self, **kwargs) -> Dict:
202 | """Override model_dump to exclude raw_response by default."""
203 | kwargs.setdefault("exclude", {"raw_response"})
204 | return super().model_dump(**kwargs)
205 |
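A hedged sketch of constructing a Message. TextContent is assumed here to be a Pydantic model accepting a `text` field, consistent with how Message.__str__ reads `c.text`; the import paths assume base_agent/ is on sys.path.

    from src.llm.base import Message
    from src.types.llm_types import TextContent  # assumed constructor: TextContent(text=...)

    msg = Message(role="user", content=[TextContent(text="Summarise the repo layout.")])
    print(msg)
    # Message from role=user
    # Text ----------
    # Summarise the repo layout.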
--------------------------------------------------------------------------------
/base_agent/src/llm/metering.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from typing import DefaultDict
7 | from collections import defaultdict
8 |
9 | from ..types.llm_types import TokenUsage, Model
10 |
11 | # A mapping from models to token usage and dollar cost
12 | token_meter: DefaultDict[Model, TokenUsage] = defaultdict(TokenUsage)
13 | budget_info: dict[str, None | int | float] = dict(
14 | start_time=None, # start timestamp
15 | cost_budget=None, # cost budget in USD
16 | time_budget=None, # time budget in seconds
17 | )
18 |
19 |
20 | def get_total_cost() -> float:
21 | total = 0.0
22 | for model in Model:
23 | total += token_meter[model].calculate_cost(model.token_cost)
24 | return total
25 |
26 |
27 | def get_total_usage() -> TokenUsage:
28 | usage = TokenUsage()
29 | for model in Model:
30 | usage += token_meter[model]
31 | return usage
32 |
33 |
34 | class CallCounter:
35 | def __init__(self):
36 | self.count = 0
37 |
38 | def count_new_call(self):
39 | self.count += 1
40 |
41 | def get_count(self) -> int:
42 | return self.count
43 |
44 |
45 | llm_call_counter = CallCounter()
46 |
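A sketch of how a provider might record usage against the meter. The TokenUsage field names are an assumption (they match the attributes read in Completion.__str__), and TokenUsage is assumed to support +=, as get_total_usage relies on.

    from src.llm.metering import token_meter, get_total_cost  # assumes base_agent/ on sys.path
    from src.types.llm_types import Model, TokenUsage

    # The defaultdict creates an empty TokenUsage on first access for each model.
    token_meter[Model.SONNET_37] += TokenUsage(input_tokens=1_000, completion_tokens=200)

    print(f"Total spend so far: ${get_total_cost():.4f}")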
--------------------------------------------------------------------------------
/base_agent/src/llm/providers/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Provider-specific implementations for different LLM services."""
7 |
8 | from .anthropic import AnthropicProvider
9 | from .openai import OpenAIProvider
10 | from .deepseek import DeepSeekProvider
11 | from .fireworks import FireworksProvider
12 |
13 | from .google import GoogleProvider
14 | from .google_rest import GoogleRESTProvider
15 | from .google_oai import GoogleOAIProvider
16 | from .vertex import VertexProvider
17 |
18 | __all__ = [
19 | "AnthropicProvider",
20 | "OpenAIProvider",
21 | "DeepSeekProvider",
22 | "FireworksProvider",
23 | "GoogleProvider",
24 | "GoogleRESTProvider",
25 | "GoogleOAIProvider",
26 | "VertexProvider",
27 | ]
28 |
--------------------------------------------------------------------------------
/base_agent/src/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from .representation import (
7 | get_schema_representation,
8 | ArgFormat,
9 | dumps,
10 | )
11 | from .xml_dumps import xml_dumps
12 | from .xml_parsing import xml_str_to_dict
13 | from .json_parsing import json_str_to_dict
14 |
15 |
16 | from typing import Type
17 | from pydantic import BaseModel
18 |
19 |
20 | async def args_str_to_dict(
21 | tool_args: str, guide_obj: Type[BaseModel], arg_format: ArgFormat, root_tag: str
22 | ) -> tuple[dict | None, str | None]:
23 |
24 | # Get schema representation for LLM fixing
25 | if arg_format == ArgFormat.JSON:
26 | return await json_str_to_dict(tool_args, guide_obj)
27 | else:
28 |         tool_args = f"<{root_tag}>\n{tool_args}\n</{root_tag}>"
29 | return await xml_str_to_dict(tool_args, guide_obj, root_tag=root_tag)
30 |
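A hedged usage sketch for args_str_to_dict with a hypothetical ToolArgs model. This assumes a well-formed payload parses locally without falling through to any LLM-based fixing path, and that base_agent/ is on sys.path.

    import asyncio

    from pydantic import BaseModel

    from src.schemas import args_str_to_dict
    from src.types.common import ArgFormat


    class ToolArgs(BaseModel):  # hypothetical argument schema
        path: str
        max_depth: int = 2


    async def demo():
        # For XML, the body is supplied without the root tag; args_str_to_dict wraps it.
        body = "<path>/tmp/project</path>\n<max_depth>3</max_depth>"
        args, warnings = await args_str_to_dict(
            body, guide_obj=ToolArgs, arg_format=ArgFormat.XML, root_tag="TOOL_ARGS"
        )
        print(args, warnings)  # expected: {'path': '/tmp/project', 'max_depth': 3} None


    asyncio.run(demo())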
--------------------------------------------------------------------------------
/base_agent/src/schemas/representation.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Utilities for consistent representation of Pydantic models.
8 | Provides both JSON and XML based formats optimized for LLM readability.
9 | """
10 |
11 | import json
12 | import logging
13 |
14 | from enum import Enum
15 | from typing import Type, Dict, Any, Union, get_args, get_origin, Literal, List
16 | from pydantic import BaseModel
17 |
18 | from .xml_dumps import xml_dumps
19 | from ..types.common import ArgFormat
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 |
24 | def get_type_info(field: Any) -> str:
25 | """
26 | Get human-readable type info for a field.
27 | Handles both Pydantic fields and schema properties.
28 | """
29 | field_type = field.annotation
30 | parts = []
31 |
32 | # Handle Optional types first
33 | is_optional = False
34 | if get_origin(field_type) is Union and type(None) in get_args(field_type):
35 | is_optional = True
36 | parts.append("optional")
37 | field_type = next(t for t in get_args(field_type) if t is not type(None))
38 |
39 | # Handle Literal types
40 | if get_origin(field_type) is Literal:
41 | literal_values = get_args(field_type)
42 | options = ", ".join(f"'{val}'" for val in literal_values)
43 | parts.append(f"one of [{options}]")
44 | elif isinstance(field_type, type) and issubclass(field_type, Enum):
45 | options = ", ".join(f"'{item.name}'" for item in field_type)
46 | parts.append(f"one of [{options}]")
47 | else:
48 | # Get base type
49 | if get_origin(field_type) is list:
50 | item_type = get_args(field_type)[0]
51 | type_str = f"list of {_get_base_type(item_type)}"
52 | elif get_origin(field_type) is dict:
53 | key_type, val_type = get_args(field_type)
54 | type_str = (
55 | f"dict of {_get_base_type(key_type)} to {_get_base_type(val_type)}"
56 | )
57 | else:
58 | type_str = _get_base_type(field_type)
59 | parts.append(type_str)
60 |
61 | # Add constraints
62 | constraints = _get_field_constraints(field)
63 | if constraints:
64 | parts.extend(constraints)
65 |
66 | # Handle special cases first
67 | is_required = not is_optional and field.is_required()
68 |
69 | if (
70 | hasattr(field.default, "__class__")
71 | and field.default.__class__.__name__ == "PydanticUndefinedType"
72 | ) or field.default is Ellipsis:
73 | # Required field (PydanticUndefined or Ellipsis)
74 | parts.append("required")
75 | elif field.default_factory is not None:
76 | # Show empty container for defaults from factory functions
77 | if field_type == Dict or get_origin(field_type) is dict:
78 | parts.append("default: {}")
79 | elif field_type == List or get_origin(field_type) is list:
80 | parts.append("default: []")
81 | elif field.default is not None:
82 | # Explicit default value
83 | parts.append(f"default: {_format_default(field.default)}")
84 | elif is_optional:
85 | # Optional field without default
86 | parts.append("default: null")
87 |
88 | # Add description
89 | if field.description:
90 | parts.append(field.description)
91 |
92 | return ", ".join(parts)
93 |
94 |
95 | def _get_base_type(field_type: Type) -> str:
96 | """Map Python types to schema types."""
97 | type_map = {
98 | str: "string",
99 | int: "integer",
100 | float: "float",
101 | bool: "boolean",
102 | Any: "any",
103 | }
104 | # For custom classes, use class name
105 | if isinstance(field_type, type):
106 | if issubclass(field_type, BaseModel):
107 | return field_type.__name__.lower()
108 | elif issubclass(field_type, Enum):
109 | return "enum"
110 | return type_map.get(field_type, str(field_type))
111 |
112 |
113 | def _get_field_constraints(field: Any) -> list[str]:
114 | """Extract field constraints as readable strings."""
115 | constraints = []
116 | metadata = field.metadata if hasattr(field, "metadata") else []
117 |
118 | constraint_names = {
119 | "Gt": ("gt", "greater than"),
120 | "Ge": ("ge", "min"),
121 | "Lt": ("lt", "less than"),
122 | "Le": ("le", "max"),
123 | "MinLength": ("min_length", "min length"),
124 | "MaxLength": ("max_length", "max length"),
125 | "MinItems": ("min_items", "min items"),
126 | "MaxItems": ("max_items", "max items"),
127 | }
128 |
129 | for item in metadata:
130 | item_type = type(item).__name__
131 | if item_type in constraint_names:
132 | attr_name, label = constraint_names[item_type]
133 | value = getattr(item, attr_name)
134 | if value is not None:
135 | constraints.append(f"{label}: {value}")
136 |
137 | return constraints
138 |
139 |
140 | def _format_default(value: Any) -> str:
141 | """Format default values consistently."""
142 | # Check for PydanticUndefined (required field with no default)
143 | if (
144 | hasattr(value, "__class__")
145 | and value.__class__.__name__ == "PydanticUndefinedType"
146 | ):
147 | return "required"
148 |
149 | if isinstance(value, str):
150 | return f"'{value}'"
151 | elif isinstance(value, (list, dict)):
152 | return json.dumps(value)
153 | elif isinstance(value, Enum):
154 | return f"'{value.name}'"
155 | elif callable(value): # e.g., default_factory
156 | return "{}" # Show empty dict/list for factory defaults
157 | elif value is Ellipsis:
158 | return "required" # Explicit handling of Ellipsis
159 | elif value is None:
160 | return "null"
161 |
162 | return str(value).lower() if isinstance(value, bool) else str(value)
163 |
164 |
165 | def get_json_schema_representation(model: Type[BaseModel]) -> str:
166 | """
167 | Generate a JSON schema representation focused on LLM readability.
168 | """
169 | fields = model.model_fields
170 | output = []
171 |
172 | for field_name, field in fields.items():
173 | type_info = get_type_info(field)
174 | output.append(f'"{field_name}": {type_info}')
175 |
176 | return "{\n " + ",\n ".join(output) + "\n}"
177 |
178 |
179 | def get_xml_schema_representation(
180 | model: Type[BaseModel], root_tag: str | None = None
181 | ) -> str:
182 | """
183 | Generate an XML schema representation focused on LLM readability.
184 | """
185 | fields = model.model_fields
186 | # Add angle brackets for root tag if necessary
187 | output = [f"<{root_tag}>"] if root_tag else []
188 |
189 | for field_name, field in fields.items():
190 | info = get_type_info(field)
191 |         output.append(f"<{field_name}>{info}</{field_name}>")
192 |
193 | if root_tag:
194 |         output.append(f"</{root_tag}>")
195 | return "\n".join(output)
196 |
197 |
198 | def get_schema_representation(
199 | cls: Type[BaseModel], arg_format: ArgFormat, root_tag: str | None = None
200 | ) -> str:
201 | if arg_format == ArgFormat.JSON:
202 | return get_json_schema_representation(cls)
203 | else:
204 | return get_xml_schema_representation(cls, root_tag=root_tag)
205 |
206 |
207 | def dumps(
208 | instance: dict, format: ArgFormat, indent: int, root_tag: str | None = None
209 | ) -> str:
210 | if format == ArgFormat.JSON:
211 | return json.dumps(instance, indent=indent)
212 | else:
213 | return xml_dumps(instance, root_tag=root_tag, indent=indent)
214 |
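What the two renderings look like for a small model. Greet is a hypothetical model; the exact wording of each field annotation is produced by get_type_info at runtime, and the imports assume base_agent/ is on sys.path.

    from pydantic import BaseModel, Field

    from src.schemas.representation import get_schema_representation
    from src.types.common import ArgFormat


    class Greet(BaseModel):  # hypothetical model
        name: str = Field(..., description="Who to greet")
        excited: bool = Field(default=False, description="Append an exclamation mark")


    print(get_schema_representation(Greet, ArgFormat.JSON))
    print(get_schema_representation(Greet, ArgFormat.XML, root_tag="TOOL_ARGS"))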
--------------------------------------------------------------------------------
/base_agent/src/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A module of Agent tools
8 | """
9 |
10 | from .base_tool import BaseTool, tool_registry
11 | from .file_tools import CloseFile, OpenFile
12 | from .edit_tools import OverwriteFile
13 | from .execute_command import ExecuteCommand
14 | from .directory_tools import ViewDirectory
15 | from .ripgrep_tool import RipGrepTool
16 |
17 | # TODO: expand the concept of toolkits and use throughout the agent implementations
18 | toolkits: dict[str, list[type[BaseTool]]] = dict(
19 | coding=[
20 | ViewDirectory,
21 | ExecuteCommand,
22 | OpenFile,
23 | CloseFile,
24 | OverwriteFile,
25 | RipGrepTool,
26 | ]
27 | )
28 |
--------------------------------------------------------------------------------
/base_agent/src/tools/answer_submission.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 |
8 | from pydantic import Field
9 |
10 | from .base_tool import BaseTool
11 | from ..schemas import args_str_to_dict
12 | from ..types.tool_types import ToolResult
13 | from ..types.agent_types import AgentInterface
14 | from ..types.common import ArgFormat
15 |
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 |
20 | class SubmitAnswer(BaseTool):
21 |     """Tool for submitting answers to benchmark questions by writing them to disk.
22 |
23 |     This is slightly different from the ReturnResult tool, which is used to
24 |     return a result at the end of an agent function call.
25 | """
26 |
27 | TOOL_NAME = "submit_answer"
28 | TOOL_DESCRIPTION = """Submit an answer to a benchmark question. The answer should be clear and concise.
29 | The tool will attempt to parse your answer according to the benchmark's requirements.
30 | Your answer should be a complete response that directly addresses the question.
31 | It is very important that you do not include any extraneous words or content in the answer field that may make the parsing fail.
32 | """
33 |
34 | # reasoning: str = Field(
35 | # ...,
36 | # description="Reason about the answer you are going to submit and the correct format in which to do so",
37 | # )
38 |
39 | answer: str = Field(
40 | ..., description="Your complete answer to the benchmark question", min_length=1
41 | )
42 |
43 | def __init__(self, calling_agent: AgentInterface, **data):
44 | super().__init__(calling_agent=calling_agent, **data)
45 |
46 | async def run(self) -> ToolResult:
47 | """Execute the answer submission with parsing."""
48 | try:
49 | if not self._calling_agent._logdir:
50 | return ToolResult(
51 | tool_name=self.TOOL_NAME,
52 | success=False,
53 | errors="System error: no answer path available",
54 | )
55 |
56 | # Validate answer is not empty or just whitespace
57 | answer = self.answer.strip()
58 | if not answer:
59 | return ToolResult(
60 | tool_name=self.TOOL_NAME,
61 | success=False,
62 | errors="Answer cannot be empty",
63 | )
64 |
65 | # Save answer to disk
66 | path = self._calling_agent._logdir / "answer.txt"
67 | with open(path, "w") as f:
68 | f.write(answer)
69 |
70 | return ToolResult(tool_name=self.TOOL_NAME, success=True)
71 |
72 | except Exception as e:
73 | return ToolResult(
74 | tool_name=self.TOOL_NAME,
75 | success=False,
76 | errors=f"Failed to save answer: {str(e)}",
77 | )
78 |
79 | @classmethod
80 | async def args_str_to_dict(
81 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML
82 | ) -> tuple[dict | None, str | None]:
83 | args_dict, parse_warnings = await args_str_to_dict(
84 | args_str, guide_obj=cls, arg_format=arg_format, root_tag="TOOL_ARGS"
85 | )
86 | if args_dict:
87 | args_dict["answer"] = str(args_dict["answer"])
88 | return args_dict, parse_warnings
89 |
90 | @classmethod
91 | def generate_examples(cls) -> list[tuple["SubmitAnswer", ToolResult]]:
92 | """Generate example uses of the submit_answer tool."""
93 | from ..agents.implementations import DemoAgent
94 |
95 | return [
96 | (
97 | cls(
98 | calling_agent=DemoAgent(),
99 | answer="5",
100 | ),
101 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
102 | ),
103 | (
104 | cls(
105 | calling_agent=DemoAgent(),
106 | # reasoning="The speed of the car is 10mph",
107 | answer="10 miles per hour",
108 | ),
109 | ToolResult(
110 | tool_name=cls.TOOL_NAME, success=False, errors="Parser error"
111 | ),
112 | ),
113 | (
114 | cls(
115 | calling_agent=DemoAgent(),
116 | # reasoning="The calculated value is 1,234.5",
117 | answer="1,234.5",
118 | ),
119 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
120 | ),
121 | ]
122 |
--------------------------------------------------------------------------------
/base_agent/src/tools/calculator.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 |
8 | from pydantic import Field
9 |
10 | from .base_tool import BaseTool
11 | from ..types.tool_types import ToolResult
12 | from ..types.agent_types import AgentInterface
13 |
14 | logger = logging.getLogger(__name__)
15 | logger.setLevel(logging.INFO)
16 |
17 |
18 | class Calculator(BaseTool):
19 | TOOL_NAME = "calculate"
20 | TOOL_DESCRIPTION = """A calculator tool that evaluates mathematical expressions.
21 | Supports basic arithmetic operations (+, -, *, /) and parentheses.
22 | All expressions must contain only numbers and valid operators."""
23 |
24 | reasoning: str = Field(
25 |         ..., description="Concise reasoning about the operation to be performed"
26 | )
27 | expression: str = Field(
28 | ...,
29 | description="Mathematical expression to evaluate",
30 | pattern=r"^[\d\s\+\-\*\/\(\)\.]+$",
31 | )
32 |
33 | def __init__(self, calling_agent: AgentInterface, **data):
34 | super().__init__(calling_agent=calling_agent, **data)
35 |
36 | async def run(self) -> ToolResult:
37 | try:
38 | result = eval(self.expression)
39 | return ToolResult(
40 | tool_name=self.TOOL_NAME, success=True, output=str(result)
41 | )
42 | except Exception as e:
43 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
44 |
45 | @classmethod
46 | def generate_examples(cls) -> list[tuple["Calculator", ToolResult]]:
47 | from ..agents.implementations import DemoAgent
48 |
49 | return [
50 | (
51 | cls(
52 | calling_agent=DemoAgent(),
53 | reasoning="The number of fruit is the sum of the two apples and three oranges",
54 | expression="2 + 3",
55 | ),
56 | ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(5)),
57 | ),
58 | (
59 | cls(
60 | calling_agent=DemoAgent(),
61 | reasoning="The compound expression will require parentheses",
62 | expression="(3 * 4) / 2",
63 | ),
64 | ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(6)),
65 | ),
66 | ]
67 |
68 |
69 | if __name__ == "__main__":
70 | import asyncio
71 | from ..agents.implementations import DemoAgent
72 |
73 | async def test():
74 | c = Calculator(calling_agent=DemoAgent(), reasoning="...", expression="2+2")
75 | result = await c.run()
76 |
77 | assert result.tool_name == Calculator.TOOL_NAME
78 | assert result.success
79 | assert result.duration < 0.5
80 | assert result.output == str(4)
81 | print("All tests pass!")
82 |
83 | asyncio.run(test())
84 |
--------------------------------------------------------------------------------
/base_agent/src/tools/directory_tools.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 |
8 | from pathlib import Path
9 | from pydantic import Field
10 |
11 | from .base_tool import BaseTool
12 | from ..utils.file_views import create_filetree, FileTreeOptions
13 | from ..types.tool_types import ToolResult
14 | from ..types.agent_types import AgentInterface
15 |
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 |
19 |
20 | class ViewDirectory(BaseTool):
21 | """Tool to generate a detailed view of directory contents."""
22 |
23 | TOOL_NAME = "view_directory"
24 | TOOL_DESCRIPTION = """View the contents of a directory with configurable depth and detail options.
25 |
26 | The tool provides a formatted tree view of the directory structure, including:
27 | - File and directory sizes
28 | - Permissions
29 | - Modification times
30 | - Smart collapsing of large directories
31 | - Configurable depth and detail level"""
32 |
33 | directory: str = Field(
34 | ...,
35 | description="The directory path to view",
36 | )
37 |     max_depth: int | None = Field(
38 | default=2,
39 | description="Maximum depth to traverse (None for unlimited)",
40 | )
41 | show_hidden: bool = Field(
42 | default=False,
43 | description="Whether to show hidden files and directories",
44 | )
45 |     collapse_threshold: int | None = Field(
46 | default=15,
47 | description="Number of items before a directory is collapsed (None for no collapsing)",
48 | )
49 | show_timestamps: bool = Field(
50 | default=False,
51 | description="Whether to show file modification timestamps",
52 | )
53 | exclude_patterns: list[str] = Field(
54 | default=[],
55 | description="List of glob patterns to exclude (e.g. '.git' or '*.pyc')",
56 | )
57 | show_full_filepaths: bool = Field(
58 | default=False,
59 | description="Whether to show the full filepaths from the root directory",
60 | )
61 |
62 | def __init__(self, calling_agent: AgentInterface, **data):
63 | super().__init__(calling_agent=calling_agent, **data)
64 |
65 | async def run(self) -> ToolResult:
66 | try:
67 | path = Path(self.directory)
68 | if not path.exists():
69 | return ToolResult(
70 | tool_name=self.TOOL_NAME,
71 | success=False,
72 | errors=f"Directory does not exist: {path}",
73 | )
74 | if not path.is_dir():
75 | return ToolResult(
76 | tool_name=self.TOOL_NAME,
77 | success=False,
78 | errors=f"Path is not a directory: {path}",
79 | )
80 |
81 | # Create options for the tree generation
82 | options = FileTreeOptions(
83 | collapse_threshold=self.collapse_threshold,
84 | show_hidden=self.show_hidden,
85 | exclude_patterns=(
86 | self.exclude_patterns
87 | if len(self.exclude_patterns) > 0 or self.show_hidden
88 | else None
89 | ),
90 | show_mtime=self.show_timestamps,
91 | min_dir_level=(
92 | 0 if self.max_depth is None else max(0, self.max_depth - 1)
93 | ),
94 | show_full_path=self.show_full_filepaths,
95 | )
96 |
97 | # Generate the tree
98 | tree_output = create_filetree(path, options)
99 |
100 | return ToolResult(
101 | tool_name=self.TOOL_NAME,
102 | success=True,
103 | output=f"Directory contents of {path}:\n{tree_output}",
104 | )
105 |
106 | except Exception as e:
107 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
108 |
109 | @classmethod
110 | def generate_examples(cls) -> list[tuple["ViewDirectory", ToolResult]]:
111 | from ..agents.implementations import DemoAgent
112 |
113 | return [
114 | # Basic directory view
115 | (
116 | cls(
117 | calling_agent=DemoAgent(),
118 | directory="/home/agent/workdir",
119 | max_depth=2,
120 | show_hidden=False,
121 | show_timestamps=False,
122 | exclude_patterns=[],
123 | collapse_threshold=20,
124 | ),
125 | ToolResult(
126 | tool_name=cls.TOOL_NAME,
127 | success=True,
128 | output="Directory contents of /home/agent/workdir:\n"
129 | "workdir/ [0755] (1.2MB, 25 files, 5 dirs)\n"
130 | " src/ [0755] (800KB, 15 files, 3 dirs)\n"
131 | " main.py [0644] 50KB\n"
132 | " utils.py [0644] 30KB\n"
133 | " tests/ [0755] (400KB, 10 files, 2 dirs) [collapsed]\n",
134 | ),
135 | ),
136 | # Detailed view with timestamps
137 | (
138 | cls(
139 | calling_agent=DemoAgent(),
140 | directory="/home/agent/project",
141 | max_depth=1,
142 | show_hidden=True,
143 | show_timestamps=True,
144 | exclude_patterns=[".git", "*.pyc"],
145 | collapse_threshold=15,
146 | ),
147 | ToolResult(
148 | tool_name=cls.TOOL_NAME,
149 | success=True,
150 | output="Directory contents of /home/agent/project:\n"
151 | "project/ [0755] (2.5MB, 40 files, 8 dirs) 2024-01-14 10:00\n"
152 | " .env [0644] 2KB 2024-01-14 09:55\n"
153 | " README.md [0644] 15KB 2024-01-14 09:50\n"
154 | " src/ [0755] (1.5MB, 25 files, 5 dirs) 2024-01-14 10:00\n"
155 | " tests/ [0755] (1MB, 15 files, 3 dirs) 2024-01-14 09:45\n",
156 | ),
157 | ),
158 | ]
159 |
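For reference, `ViewDirectory` can be exercised standalone in the same way as the calculator's `__main__` self-test above; a minimal sketch, assuming it runs inside the package so the relative imports (and `DemoAgent`) resolve:

    import asyncio

    async def demo():
        tool = ViewDirectory(
            calling_agent=DemoAgent(),
            directory="/home/agent/workdir",
            max_depth=1,
            exclude_patterns=["*.pyc"],
        )
        result = await tool.run()
        # On success, output holds the formatted file tree.
        print(result.output if result.success else result.errors)

    asyncio.run(demo())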
--------------------------------------------------------------------------------
/base_agent/src/tools/edit_tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from .overwrite_file import OverwriteFile
7 |
8 | __all__ = ["OverwriteFile"]
9 |
--------------------------------------------------------------------------------
/base_agent/src/tools/edit_tools/overwrite_file.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import re
7 | import logging
8 |
9 | from pathlib import Path
10 | from pydantic import Field
11 |
12 | from .utils import edit_preflight_check, generate_edit_event_content
13 | from ...schemas.json_parsing import json_str_to_dict
14 | from ..base_tool import BaseTool, extract_between_patterns
15 | from ...events import EventBus
16 | from ...types.tool_types import ToolResult
17 | from ...types.event_types import EventType, FileOperation, FileEvent
18 | from ...types.agent_types import AgentInterface
19 | from ...types.common import ArgFormat
20 |
21 | logger = logging.getLogger(__name__)
22 | logger.setLevel(logging.INFO)
23 |
24 |
25 | class OverwriteFile(BaseTool):
26 | """Tool to overwrite an existing file or create a new one with content."""
27 |
28 | TOOL_NAME = "overwrite_file"
29 | TOOL_DESCRIPTION = f"""Use this tool when you want to write content verbatim to a file, either overwriting an existing file or creating a new one.
30 |
31 | For existing files:
32 | - You MUST have called the `open_file` tool to view the file before over-writing it
33 | - This is to make sure we're not over-writing anything of value that needs to be kept
34 | - The entire content will be replaced verbatim with the new content provided
35 |
36 | For new files:
37 | - 'Overwriting' a not-yet-existing file will create it
38 | - The file will be automatically opened in the context window after creation
39 |
40 | Very important notes:
41 | - The content you provide to this tool will be that file's new content. You must make sure to include absolutely everything you still need
42 | - Do NOT "fold" any code sections because this will cause errors. Instead, write out everything verbatim.
43 |
44 | - DO NOT, under any circumstances, call this tool for a file edit that exceeds about 500 lines. It will be slow, inefficient, costly and error-prone. For these types of large-file edits, you should seek to use more efficient editing tools.
45 | - You do not need to write out the file ahead of time before invoking this tool.
46 | """
47 |
48 | filepath: str = Field(
49 | ...,
50 | description="The full absolute filepath of the file to write. For existing files, must be already open in context window.",
51 | )
52 | full_unabridged_new_content: str = Field(
53 | ...,
54 | description="The full content to write to the file, which will entirely replace any existing content.",
55 | )
56 |
57 | def __init__(self, calling_agent: AgentInterface, **data):
58 | super().__init__(calling_agent=calling_agent, **data)
59 |
60 | @classmethod
61 | async def args_str_to_dict(
62 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML
63 | ) -> tuple[dict | None, str | None]:
64 | if arg_format == ArgFormat.XML:
65 | # Carefully extract the content, with the assumption that there _will_
66 | # be conflicting tags.
67 | # First, manually get the content between the <filepath> tags
68 | filepath_pattern = r"<filepath>(.*?)</filepath>"
69 | filepath_match = re.search(filepath_pattern, args_str)
70 | filepath = filepath_match.group(1) if filepath_match else None
71 | if not filepath:
72 | return None, "Could not parse filepath"
73 |
74 | # Extract between the first opening tag and the last closing tag
75 | content = extract_between_patterns(
76 | args_str, "<full_unabridged_new_content>", "</full_unabridged_new_content>"
77 | )
78 | if not content:
79 | return None, "Could not parse file content"
80 |
81 | return dict(filepath=filepath, full_unabridged_new_content=content), None
82 | else:
83 | return await json_str_to_dict(args_str, guide_obj=cls)
84 |
85 | async def run(self) -> ToolResult:
86 | try:
87 | path = Path(self.filepath)
88 | event_bus = await EventBus.get_instance()
89 |
90 | # Check if file exists
91 | file_exists = path.exists()
92 |
93 | if not file_exists:
94 | # Create directory structure if needed
95 | path.parent.mkdir(parents=True, exist_ok=True)
96 |
97 | # For new files, write content first
98 | try:
99 | path.write_text(self.full_unabridged_new_content)
100 |
101 | event = FileEvent(
102 | type=EventType.FILE_EVENT,
103 | content=self.full_unabridged_new_content,
104 | path=str(path),
105 | operation=FileOperation.OPEN,
106 | )
107 |
108 | await event_bus.publish(event, self._calling_agent._id)
109 |
110 | return ToolResult(
111 | tool_name=self.TOOL_NAME,
112 | success=True,
113 | output=f"Successfully created new file {path}",
114 | )
115 | except Exception as e:
116 | return ToolResult(
117 | tool_name=self.TOOL_NAME,
118 | success=False,
119 | errors=f"Failed to create new file {path}: {str(e)}",
120 | )
121 | else:
122 | # For existing files, verify it's open first
123 | result = await edit_preflight_check(
124 | path, self.TOOL_NAME, self._calling_agent
125 | )
126 | if result:
127 | return result
128 |
129 | prev_content = path.read_text()
130 |
131 | # Now write new content
132 | try:
133 | path.write_text(self.full_unabridged_new_content)
134 |
135 | diff_content, content_hash = generate_edit_event_content(
136 | prev_content, self.full_unabridged_new_content, str(path)
137 | )
138 |
139 | event = FileEvent(
140 | type=EventType.FILE_EVENT,
141 | content=diff_content,
142 | path=str(path),
143 | operation=FileOperation.EDIT,
144 | content_hash=content_hash,
145 | mtime=path.stat().st_mtime,
146 | )
147 |
148 | await event_bus.publish(event, self._calling_agent._id)
149 |
150 | return ToolResult(
151 | tool_name=self.TOOL_NAME,
152 | success=True,
153 | output=f"Successfully overwrote content of {path}",
154 | )
155 | except Exception as e:
156 | return ToolResult(
157 | tool_name=self.TOOL_NAME,
158 | success=False,
159 | errors=f"Failed to write to file {path}: {str(e)}",
160 | )
161 |
162 | except Exception as e:
163 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
164 |
165 | @classmethod
166 | def generate_examples(cls) -> list[tuple["OverwriteFile", ToolResult]]:
167 | from ...agents.implementations import DemoAgent
168 |
169 | return [
170 | # Example 1: Create new file
171 | (
172 | cls(
173 | calling_agent=DemoAgent(),
174 | filepath="/home/agent/workdir/new_file.txt",
175 | full_unabridged_new_content="Content for the new file",
176 | ),
177 | ToolResult(
178 | tool_name=cls.TOOL_NAME,
179 | success=True,
180 | output="Successfully created new file /home/agent/workdir/new_file.txt",
181 | ),
182 | ),
183 | # Example 2: Overwrite existing file
184 | (
185 | cls(
186 | calling_agent=DemoAgent(),
187 | filepath="/home/agent/workdir/example.txt",
188 | full_unabridged_new_content="New content for existing file",
189 | ),
190 | ToolResult(
191 | tool_name=cls.TOOL_NAME,
192 | success=True,
193 | output="Successfully overwrote content of /home/agent/workdir/example.txt",
194 | ),
195 | ),
196 | ]
197 |
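The custom `args_str_to_dict` above exists because the new file content may itself contain a literal closing tag; pairing the first opening tag with the last closing tag keeps such content intact (this is what `extract_between_patterns` in `utils/parsing.py`, shown later, does by default). A small self-contained illustration of the idea:

    # Argument string whose content embeds the very tag that wraps it.
    args = (
        "<filepath>/tmp/notes.xml</filepath>"
        "<full_unabridged_new_content>"
        "outer <full_unabridged_new_content>inner</full_unabridged_new_content> text"
        "</full_unabridged_new_content>"
    )

    def between_first_and_last(s: str, a: str, b: str) -> str:
        start = s.find(a) + len(a)  # first opening tag
        end = s.rfind(b)            # last closing tag
        return s[start:end]

    content = between_first_and_last(
        args, "<full_unabridged_new_content>", "</full_unabridged_new_content>"
    )
    # The embedded tags survive intact:
    assert content == (
        "outer <full_unabridged_new_content>inner</full_unabridged_new_content> text"
    )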
--------------------------------------------------------------------------------
/base_agent/src/tools/edit_tools/utils.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import hashlib
7 | import difflib
8 |
9 | from pathlib import Path
10 | from datetime import datetime, timedelta
11 |
12 | from ...types.tool_types import ToolResult
13 | from ...types.agent_types import AgentInterface, InheritanceFlags
14 | from ...events.event_bus_utils import is_file_open, get_latest_file_event
15 |
16 |
17 | async def edit_preflight_check(
18 | path: Path, tool_name: str, calling_agent: AgentInterface
19 | ) -> ToolResult | None:
20 | inherits_parent_files = (
21 | InheritanceFlags.OPEN_FILES in calling_agent.INHERITANCE.flags
22 | )
23 |
24 | file_open: bool = await is_file_open(str(path), calling_agent._id)
25 | if inherits_parent_files and not file_open:
26 | file_open = await is_file_open(str(path), calling_agent._parent_id)
27 |
28 | # Verify file is open
29 | if not file_open:
30 | return ToolResult(
31 | tool_name=tool_name,
32 | success=False,
33 | errors=f"File {path} must be opened first using the open_file tool",
34 | )
35 |
36 | eps = timedelta(seconds=0.5)
37 | latest_file_event = await get_latest_file_event(
38 | str(path), calling_agent._id, exclude_close=True
39 | )
40 | # Assumes agent runs are blocking (i.e. all agent file events will be newer
41 | # than parent file events)
42 | if inherits_parent_files and not latest_file_event:
43 | latest_file_event = await get_latest_file_event(
44 | str(path),
45 | calling_agent._parent_id,
46 | exclude_close=True,
47 | )
48 |
49 | last_mod = datetime.fromtimestamp(path.stat().st_mtime)
50 | if not latest_file_event or last_mod > latest_file_event.timestamp + eps:
51 | last_viewed = (
52 | latest_file_event.timestamp.strftime("%Y-%m-%d %H:%M:%S")
53 | if latest_file_event
54 | else "Never"
55 | )
56 | return ToolResult(
57 | tool_name=tool_name,
58 | success=False,
59 | errors=(
60 | f"File {path} was changed at {last_mod.strftime('%Y-%m-%d %H:%M:%S')}, "
61 | f"which is after you last viewed or edited it at {last_viewed}."
62 | "Please view it again to get its latest contents before making your edit."
63 | ),
64 | )
65 |
66 |
67 | def generate_edit_event_content(
68 | old_content: str, new_content: str, path: str
69 | ) -> tuple[str, str]:
70 | """Generate a diff between old and new content for file events.
71 |
72 | Returns:
73 | tuple[str, str]: A tuple containing (content_for_event, content_hash)
74 | where content_for_event contains the diff and content_hash is the hash of new_content
75 | """
76 | if not old_content and new_content:
77 | # For new files, return the full content
78 | content_hash = hashlib.sha256(new_content.encode()).hexdigest()
79 | return new_content, content_hash
80 |
81 | # Generate unified diff
82 | old_lines = old_content.splitlines()
83 | new_lines = new_content.splitlines()
84 |
85 | diff = list(
86 | difflib.unified_diff(
87 | old_lines,
88 | new_lines,
89 | fromfile=f"a/{path}",
90 | tofile=f"b/{path}",
91 | lineterm="",
92 | )
93 | )
94 |
95 | if diff:
96 | diff_content = "\n".join(diff)
97 | else:
98 | diff_content = "No changes"
99 |
100 | content_hash = hashlib.sha256(new_content.encode()).hexdigest()
101 | return diff_content, content_hash
102 |
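As a quick sanity check of the diff generation, here is what `generate_edit_event_content` produces for a one-line change (a sketch, assuming this module's names are in scope):

    old = "alpha\nbeta\ngamma\n"
    new = "alpha\nBETA\ngamma\n"
    diff, digest = generate_edit_event_content(old, new, "notes.txt")
    print(diff)
    # --- a/notes.txt
    # +++ b/notes.txt
    # @@ -1,3 +1,3 @@
    #  alpha
    # -beta
    # +BETA
    #  gamma
    assert len(digest) == 64  # hex-encoded sha256 of the new content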
--------------------------------------------------------------------------------
/base_agent/src/tools/file_tools.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 |
8 | from pathlib import Path
9 | from pydantic import Field
10 |
11 | from .base_tool import BaseTool
12 | from ..events import EventBus
13 | from ..events.event_bus_utils import get_open_file_set
14 | from ..types.tool_types import ToolResult
15 | from ..types.event_types import EventType, FileOperation, FileEvent
16 | from ..types.agent_types import AgentInterface
17 |
18 | logger = logging.getLogger(__name__)
19 | logger.setLevel(logging.INFO)
20 |
21 |
22 | class OpenFile(BaseTool):
23 | TOOL_NAME = "open_files"
24 | TOOL_DESCRIPTION = """A file viewer tool that allows you to see the contents of one or more files in your context window.
25 |
26 | Note: you should use /home/agent/workdir as your working directory if possible, although specifying file paths outside this directory will also work.
27 |
28 | Features:
29 | - View multiple files at once
30 | - Optional line number display for easier reference
31 | - Automatic warning for non-text files
32 |
33 | VERY IMPORTANT NOTE: you should only open files that are in a plain text format (e.g. something that you might open in a code editor). Opening media files, binary formats or any other non-text format will lead to unpredictable results.
34 | """
35 |
36 | file_paths: list[str] = Field(
37 | ...,
38 | description="A list of one or more absolute filepaths to add to open in your context window",
39 | )
40 | show_line_numbers: bool = Field(
41 | False,
42 | description="When True, displays line numbers in the left margin of the file for easier reference.",
43 | )
44 |
45 | def __init__(self, calling_agent: AgentInterface, **data):
46 | super().__init__(calling_agent=calling_agent, **data)
47 |
48 | async def run(self) -> ToolResult:
49 | try:
50 | output_strings = []
51 | warnings = []
52 | total_lines = 0
53 | for fpath in self.file_paths:
54 | path = Path(fpath)
55 | if not path.exists():
56 | warnings.append(f"File path: {path} does not exist")
57 | continue
58 |
59 | output_strings.append(f"The file at {path} was opened successfully.")
60 |
61 | file_content = path.read_text()
62 | event = FileEvent(
63 | type=EventType.FILE_EVENT,
64 | content=file_content,
65 | path=str(fpath),
66 | operation=FileOperation.OPEN,
67 | metadata={"show_line_numbers": self.show_line_numbers},
68 | )
69 | total_lines += len(file_content.splitlines())
70 |
71 | event_bus = await EventBus.get_instance()
72 | await event_bus.publish(event, self._calling_agent._id)
73 |
74 | if total_lines > 750:
75 | warnings.append(f"You have added {total_lines} lines of content to the context, which is quite high. If this file is not immediately relevant to the task at hand, you should make sure to close it (and any other long files) with the close_file tool.")
76 |
77 | return ToolResult(
78 | tool_name=self.TOOL_NAME,
79 | success=True,
80 | output="\n".join(output_strings) if output_strings else None,
81 | warnings="\n".join(warnings) if warnings else None,
82 | )
83 | except Exception as e:
84 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
85 |
86 | @classmethod
87 | def generate_examples(cls) -> list[tuple["OpenFile", ToolResult]]:
88 | from ..agents.implementations import DemoAgent
89 |
90 | return [
91 | (
92 | cls(
93 | calling_agent=DemoAgent(),
94 | file_paths=["/home/agent/workdir/example.txt"],
95 | show_line_numbers=False,
96 | ),
97 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
98 | ),
99 | (
100 | cls(
101 | calling_agent=DemoAgent(),
102 | file_paths=["/tmp/example.txt", "/home/agent/workdir/new.txt"],
103 | show_line_numbers=True,
104 | ),
105 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
106 | ),
107 | ]
108 |
109 |
110 | class CloseFile(BaseTool):
111 | TOOL_NAME = "close_files"
112 | TOOL_DESCRIPTION = """Close one or more open files to clear up space in the context window.
113 |
114 | Note that you can call this tool with the empty list [] as the file_paths to close all open files.
115 | """
116 |
117 | file_paths: list[str] = Field(
118 | ...,
119 | description="A list of one or more absolute file paths to close. If this is the empty list, then all files will be closed",
120 | )
121 |
122 | def __init__(self, calling_agent: AgentInterface, **data):
123 | super().__init__(calling_agent=calling_agent, **data)
124 |
125 | async def run(self) -> ToolResult:
126 | try:
127 | event_bus = await EventBus.get_instance()
128 |
129 | if len(self.file_paths) == 0:
130 | open_files = await get_open_file_set(self._calling_agent._id)
131 | for open_event in open_files:
132 | close_event = FileEvent(
133 | type=EventType.FILE_EVENT,
134 | content="",
135 | path=open_event.path,
136 | operation=FileOperation.CLOSE,
137 | )
138 | await event_bus.publish(close_event, self._calling_agent._id)
139 | return ToolResult(
140 | tool_name=self.TOOL_NAME,
141 | success=True,
142 | )
143 |
144 | for fpath in self.file_paths:
145 | close_event = FileEvent(
146 | type=EventType.FILE_EVENT,
147 | content="",
148 | path=fpath,
149 | operation=FileOperation.CLOSE,
150 | )
151 | await event_bus.publish(close_event, self._calling_agent._id)
152 |
153 | return ToolResult(
154 | tool_name=self.TOOL_NAME,
155 | success=True,
156 | )
157 | except Exception as e:
158 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
159 |
160 | @classmethod
161 | def generate_examples(cls) -> list[tuple["CloseFile", ToolResult]]:
162 | from ..agents.implementations import DemoAgent
163 |
164 | return [
165 | (
166 | cls(
167 | calling_agent=DemoAgent(),
168 | file_paths=["/home/agent/workdir/example.txt"],
169 | ),
170 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
171 | ),
172 | (
173 | cls(
174 | calling_agent=DemoAgent(),
175 | file_paths=[],
176 | ),
177 | ToolResult(tool_name=cls.TOOL_NAME, success=True),
178 | ),
179 | ]
180 |
--------------------------------------------------------------------------------
/base_agent/src/tools/reasoning_structures/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/tools/reasoning_structures/__init__.py
--------------------------------------------------------------------------------
/base_agent/src/tools/reasoning_structures/coding.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A reasoning structure for coding.
8 | """
9 | import logging
10 |
11 | from pydantic import PrivateAttr
12 |
13 | from .sequential import Step, ToolBasedReasoningStructure, _make_id
14 | from ...types.tool_types import ToolResult
15 | from ...types.agent_types import AgentInterface
16 |
17 | logger = logging.getLogger(__name__)
18 | logger.setLevel(logging.INFO)
19 |
20 |
21 | class CodingReasoningStructure(ToolBasedReasoningStructure):
22 |
23 | TOOL_NAME = "coding_reasoning_structure"
24 | TOOL_DESCRIPTION = """Apply this reasoning structure when you detect that you have a non-trivial coding implementation task that requires a methodical approach involving initial exploration, implementation, verification and cleanup to complete well.
25 |
26 | Do not call this tool if you are merely verifying or testing, or if the task at hand is quick and does not require such rigour.
27 |
28 | This reasoning structure will guide you through good software engineering practices, and ensure that no steps have been missed out.
29 | """
30 | _steps: list[Step] = PrivateAttr(default_factory=lambda: [
31 | Step(
32 | identifier=_make_id(),
33 | instruction="Explore the project to a) locate all useful documentation (README.md files, common likely MD documentation files, etc), b) all files that may relate to your programming instructions, c) identify module-level and file-level design patterns and conventions.",
34 | done_description="You have viewed each of these files, made sure to close irrelevant or long files, and taken notes or summaries. Note that for greenfiled projects, this step may complete trivially.",
35 | failed_description="Files could not be opened for some reason, or the project location is unclear.",
36 | ),
37 | Step(
38 | identifier=_make_id(),
39 | instruction="Carefully implement the solution completely and thoroughly. Make sure you observe any existing stylistic conventions, and effectively re-use existing design patterns or modules to avoid duplicating functionality.",
40 | done_description="A first pass at the code implementation has been implemented, with tests not yet having been run.",
41 | failed_description="You have got stuck trying to get dependencies set up, getting mocks and fixtures set up, or have otherwise digressed from the core code implementation.",
42 | ),
43 | Step(
44 | identifier=_make_id(),
45 | instruction="Test the implementation end-to-end, favouring test scripts instead of test frameworks. If this is not an option or the project already has a test framework set up, then use that.",
46 | done_description="You have ensured that the code is valid, hasn't introduced any regressions and works as intended",
47 | failed_description="You have got stuck writing TDD loops, getting dependencies set up, getting mocks and fixtures set up",
48 | ),
49 | Step(
50 | identifier=_make_id(),
51 | instruction="Clean up: remove any temporary test scripts, toy implementations or other scaffolding. Check that all documentation and docstrings are up-to-date.",
52 | done_description="All temporary files have been removed, and documentation updated.",
53 | ),
54 | ])
55 |
56 | def __init__(self, calling_agent: AgentInterface, **data):
57 | super().__init__(calling_agent=calling_agent, **data)
58 |
59 | @classmethod
60 | def generate_examples(cls) -> list[tuple["CodingReasoningStructure", ToolResult]]:
61 | from ...agents.implementations import DemoAgent
62 |
63 | return [
64 | (
65 | cls(calling_agent=DemoAgent()),
66 | ToolResult(
67 | tool_name=cls.TOOL_NAME,
68 | success=True,
69 | output="The first step in the meta improvement process is: ...",
70 | ),
71 | ),
72 | ]
73 |
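New structures follow the same recipe: subclass `ToolBasedReasoningStructure` and declare an ordered list of `Step`s as a private attribute. A hypothetical minimal example mirroring the class above (the name and step text are illustrative, not part of the repository):

    class ReviewReasoningStructure(ToolBasedReasoningStructure):

        TOOL_NAME = "review_reasoning_structure"
        TOOL_DESCRIPTION = "Guide a methodical two-step code review."

        _steps: list[Step] = PrivateAttr(default_factory=lambda: [
            Step(
                identifier=_make_id(),
                instruction="Read the changed files and note any correctness issues.",
                done_description="Every changed file has been read and annotated.",
                failed_description="The changed files could not be located.",
            ),
            Step(
                identifier=_make_id(),
                instruction="Summarise the findings as actionable review comments.",
                done_description="A prioritised list of review comments exists.",
            ),
        ])
        # (__init__ and generate_examples omitted for brevity)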
--------------------------------------------------------------------------------
/base_agent/src/tools/reasoning_structures/sequential_subagents.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A proof of concept / template for a sub-agent based reasoning structure, where
8 | each step is a hard-coded sub-agent call.
9 | """
10 | import logging
11 |
12 | from uuid import uuid4
13 |
14 | from ..base_tool import BaseTool
15 | from ...types.tool_types import ToolResult
16 | from ...types.agent_types import AgentInterface
17 | from ...types.llm_types import FCI, ToolCallContent
18 | from ...agents.implementations.coder import CodingAgent
19 |
20 | logger = logging.getLogger(__name__)
21 | logger.setLevel(logging.INFO)
22 |
23 |
24 | class SubagentBasedReasoningStructure(BaseTool):
25 |
26 | TOOL_NAME = "example_subagent_reasoning_structure"
27 | TOOL_DESCRIPTION = """Reason through a fixed list of points sequentially."""
28 |
29 | def __init__(self, calling_agent: AgentInterface, **data):
30 | super().__init__(calling_agent=calling_agent, **data)
31 |
32 | async def run(self) -> ToolResult:
33 | parent_agent: AgentInterface = self._calling_agent
34 |
35 | try:
36 | await parent_agent._handle_agent_call(ToolCallContent(
37 | call_id=f"agent_{uuid4().hex[:8]}",
38 | tool_name=CodingAgent.AGENT_NAME,
39 | tool_args=dict(
40 | programming_instructions="Print 'a' in a file called 'a.txt'",
41 | ),
42 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing a call, otherwise it causes 400 errors with the providers.
43 | ))
44 |
45 | await parent_agent._handle_agent_call(ToolCallContent(
46 | call_id=f"agent_{uuid4().hex[:8]}",
47 | tool_name=CodingAgent.AGENT_NAME,
48 | tool_args=dict(
49 | programming_instructions="Print 'b' in a file called 'b.txt'",
50 | ),
51 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing a call, otherwise it causes 400 errors with the providers.
52 | ))
53 |
54 | await parent_agent._handle_agent_call(ToolCallContent(
55 | call_id=f"agent_{uuid4().hex[:8]}",
56 | tool_name=CodingAgent.AGENT_NAME,
57 | tool_args=dict(
58 | programming_instructions="Print 'c' in a file called 'c.txt'",
59 | ),
60 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing a call, otherwise it causes 400 errors with the providers.
61 | ))
62 |
63 | return ToolResult(
64 | tool_name=self.TOOL_NAME,
65 | success=True,
66 | output="Completed successfully"
67 | )
68 |
69 | except Exception as e:
70 | return ToolResult(
71 | tool_name=self.TOOL_NAME,
72 | success=False,
73 | errors=f"Error in sequential reasoning: {e}"
74 | )
75 |
76 | @classmethod
77 | def generate_examples(cls) -> list[tuple["SubagentBasedReasoningStructure", ToolResult]]:
78 | from ...agents.implementations import DemoAgent
79 |
80 | return [
81 | (
82 | cls(calling_agent=DemoAgent()),
83 | ToolResult(
84 | tool_name=cls.TOOL_NAME,
85 | success=True,
86 | output="Successfully did the ABC",
87 | ),
88 | ),
89 | ]
90 |
--------------------------------------------------------------------------------
/base_agent/src/types/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/types/__init__.py
--------------------------------------------------------------------------------
/base_agent/src/types/common.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from enum import Enum
7 |
8 |
9 | class ArgFormat(str, Enum):
10 | """Tool argument formats"""
11 |
12 | XML = "xml"
13 | JSON = "json"
14 |
--------------------------------------------------------------------------------
/base_agent/src/types/event_types.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import hashlib
7 |
8 | from enum import Enum
9 | from pathlib import Path
10 | from datetime import datetime
11 | from dataclasses import field, dataclass
12 |
13 |
14 | class EventType(Enum):
15 | ASSISTANT_MESSAGE = "assistant_message"
16 | ASSISTANT_REASONING = "assistant_reasoning"
17 | TOOL_CALL = "tool_call"
18 | TOOL_RESULT = "tool_result"
19 | AGENT_CALL = "agent_call"
20 | AGENT_RESULT = "agent_result"
21 | CORE_PROMPT_UPDATE = "core_prompt_update"
22 | SYSTEM_PROMPT_UPDATE = "system_prompt_update"
23 | FILE_EVENT = "file_event"
24 | APPLICATION_ERROR = "application_error"
25 | APPLICATION_WARNING = "application_warning"
26 | PROBLEM_STATEMENT = "problem_statement" # initial problem statement
27 | EXTERNAL_MESSAGE = "external_message" # subsequent update messages
28 | OVERSEER_NOTIFICATION = "overseer_notification"
29 | OVERSEER_UPDATE = "overseer_update" # for debugging
30 | BUDGET_INFO = "budget_info"
31 | TIMEOUT = "timeout"
32 | COST_LIMIT = "cost_limit"
33 |
34 |
35 | @dataclass
36 | class Event:
37 | """Base class for all events in the stream"""
38 |
39 | type: EventType
40 | content: str
41 | metadata: dict = field(default_factory=dict)
42 | timestamp: datetime = field(default_factory=datetime.now)
43 |
44 |
45 | class FileOperation(Enum):
46 | OPEN = "open"
47 | CLOSE = "close"
48 | EDIT = "edit"
49 |
50 |
51 | @dataclass
52 | class FileEvent:
53 | """Special event for file operations"""
54 |
55 | type: EventType
56 | content: str # NOTE: this is the formatted content, not just the raw file content (e.g. with line numbers, content hash, lsp diagnostics, etc)
57 | operation: FileOperation
58 | path: str
59 |
60 | timestamp: datetime = field(default_factory=datetime.now)
61 | metadata: dict = field(default_factory=dict) # NOTE: unused
62 |
63 | mtime: float = field(default=0.0)
64 | content_hash: str = field(default="")
65 | diff: str | None = None
66 | # lsp_diagnostics: list = field(default_factory=list)
67 |
68 | def __post_init__(self):
69 | """Compute hash on initialization if not provided"""
70 | if not self.content_hash and self.content:
71 | self.content_hash = hashlib.sha256(self.content.encode()).hexdigest()
72 |
73 | if self.mtime == 0.0:
74 | try:
75 | self.mtime = Path(self.path).stat().st_mtime
76 | except Exception:
77 | pass
78 |
79 | def __hash__(self):
80 | return hash((self.type, self.operation, self.path, self.content_hash))
81 |
82 | def __eq__(self, other):
83 | if not isinstance(other, FileEvent):
84 | return False
85 | return (
86 | self.type == other.type
87 | and self.operation == other.operation
88 | and self.path == other.path
89 | and self.content_hash == other.content_hash
90 | )
91 |
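Because `__hash__` and `__eq__` are keyed on the content hash rather than the timestamp, two events describing the same operation on the same content deduplicate cleanly, e.g. when building an open-file set. A quick sketch, assuming this module's names are in scope:

    a = FileEvent(type=EventType.FILE_EVENT, content="x = 1\n",
                  operation=FileOperation.OPEN, path="/tmp/a.py")
    b = FileEvent(type=EventType.FILE_EVENT, content="x = 1\n",
                  operation=FileOperation.OPEN, path="/tmp/a.py")
    assert a == b and len({a, b}) == 1  # timestamps differ, hashes agree

    c = FileEvent(type=EventType.FILE_EVENT, content="x = 2\n",
                  operation=FileOperation.OPEN, path="/tmp/a.py")
    assert a != c  # different content hash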
--------------------------------------------------------------------------------
/base_agent/src/types/tool_types.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import os
7 |
8 | from abc import ABC, abstractmethod
9 | from typing import Any, ClassVar
10 | from pydantic import BaseModel, Field
11 | from ..schemas import dumps
12 | from ..types.common import ArgFormat
13 |
14 |
15 | class ToolResult(BaseModel):
16 | """Represents the result of a tool execution."""
17 |
18 | tool_name: str
19 | success: bool
20 | duration: float = 0.0 # on tool error paths, duration is often 0
21 | output: dict[str, Any] | str | None = None
22 | warnings: str | None = None
23 | errors: str | None = None
24 | invocation_id: str = Field(default_factory=lambda: os.urandom(4).hex())
25 |
26 | def __str__(self):
27 | str_output = self.output if isinstance(self.output, str) else None
28 | if isinstance(self.output, dict):
29 | str_output = dumps(self.output, ArgFormat.XML, indent=2)
30 |
31 | tool_response_str = f"<{self.tool_name}_response>"
32 | tool_response_str += (
33 | f"\n<status>{'SUCCESS' if self.success else 'FAILURE'}</status>"
34 | )
35 | if str_output is not None:
36 | tool_response_str += f"\n<result>{str_output}</result>"
37 | if self.warnings is not None:
38 | tool_response_str += f"\n<warnings>{self.warnings}</warnings>"
39 | if self.errors is not None:
40 | tool_response_str += f"\n<errors>{self.errors}</errors>"
41 | if self.duration is not None:
42 | tool_response_str += f"\n<duration>{self.duration:.3f}</duration>"
43 | tool_response_str += f"\n</{self.tool_name}_response>"
44 |
45 | return tool_response_str
46 |
47 | def to_plain_string(self):
48 | str_output = self.output if isinstance(self.output, str) else None
49 | if isinstance(self.output, dict):
50 | str_output = dumps(self.output, ArgFormat.JSON, indent=2)
51 |
52 | tool_response_str = f"{self.tool_name} response:"
53 | tool_response_str += f"\nSuccess: {self.success}"
54 | if str_output is not None:
55 | tool_response_str += f"\nResult: {str_output}"
56 | if self.warnings is not None:
57 | tool_response_str += f"\nWarnings: {self.warnings}"
58 | if self.errors is not None:
59 | tool_response_str += f"\nErrors: {self.errors}"
60 | if self.duration is not None:
61 | tool_response_str += f"\nDuration: {self.duration:.3f}"
62 |
63 | return tool_response_str
64 |
65 |
66 | class ToolInterface(BaseModel, ABC):
67 | """Abstract interface for all tools"""
68 |
69 | # Class variables
70 | TOOL_NAME: ClassVar[str]
71 | TOOL_DESCRIPTION: ClassVar[str]
72 | EPHEMERAL: ClassVar[bool] = False
73 |
74 | class Config:
75 | extra = "forbid"
76 |
77 | @abstractmethod
78 | async def run(self) -> ToolResult:
79 | """Execute the tool's functionality"""
80 | pass
81 |
82 | @classmethod
83 | @abstractmethod
84 | def generate_examples(cls) -> list[tuple["BaseTool", ToolResult]]:
85 | """Generate example uses of the tool with their expected outputs"""
86 | pass
87 |
88 | @classmethod
89 | @abstractmethod
90 | def to_prompt_format(cls, arg_format: ArgFormat = ArgFormat.XML) -> str:
91 | """Convert the tool definition to XML format for the unconstrained tool use prompt."""
92 | pass
93 |
94 | @classmethod
95 | @abstractmethod
96 | def to_plain_prompt_format(cls, arg_format: ArgFormat = ArgFormat.JSON) -> str:
97 | """Convert the tool definition to a formatted string for the constrained tool use prompt.
98 |
99 | NOTE: most providers use JSON-like syntax in their prompts, so
100 | generating few-shot examples like this tends to work better.
101 | """
102 | pass
103 |
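For a concrete sense of the plain rendering, a small sketch using `to_plain_string` (assuming `ToolResult` is importable):

    result = ToolResult(tool_name="calculator", success=True, duration=0.012, output="5")
    print(result.to_plain_string())
    # calculator response:
    # Success: True
    # Result: 5
    # Duration: 0.012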
--------------------------------------------------------------------------------
/base_agent/src/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/utils/__init__.py
--------------------------------------------------------------------------------
/base_agent/src/utils/metrics.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import random
7 |
8 | from datetime import datetime, timedelta
9 |
10 | from ..types.llm_types import TokenUsage
11 | from ..types.agent_types import AgentMetrics
12 |
13 |
14 | def make_random_agent_metrics(
15 | tools_enabled: bool = True,
16 | agents_enabled: bool = True,
17 | min_duration_seconds: int = 1,
18 | max_duration_seconds: int = 300,
19 | base_prompt_tokens: int = 500,
20 | token_variance: float = 0.3,
21 | cache_hit_rate: float = 0.4,
22 | cache_write_rate: float = 0.3,
23 | cost_per_1k_tokens: float = 0.002,
24 | seed: int = 42 # Added seed parameter
25 | ) -> AgentMetrics:
26 | """
27 | Generate random but plausible agent metrics deterministically.
28 |
29 | Args:
30 | tools_enabled: Whether tools are enabled for this agent
31 | agents_enabled: Whether sub-agents are enabled for this agent
32 | min_duration_seconds: Minimum execution duration in seconds
33 | max_duration_seconds: Maximum execution duration in seconds
34 | base_prompt_tokens: Base number of prompt tokens to vary around
35 | token_variance: How much to vary token counts (as proportion of base)
36 | cache_hit_rate: Proportion of tokens that should be cached hits
37 | cache_write_rate: Proportion of uncached tokens that should be written to cache
38 | cost_per_1k_tokens: Cost per 1000 tokens in dollars
39 | seed: Random seed for deterministic output
40 |
41 | Returns:
42 | AgentMetrics object with randomized but plausible values
43 | """
44 | # Set the random seed for reproducibility
45 | random.seed(seed)
46 |
47 | # Use a fixed base time instead of datetime.now()
48 | base_time = datetime(2025, 1, 1, 0, 0, 0) # Fixed starting point
49 | start_time = base_time - timedelta(days=random.randint(0, 7))
50 | duration = random.uniform(min_duration_seconds, max_duration_seconds)
51 | end_time = start_time + timedelta(seconds=duration)
52 |
53 | # Calculate base token counts with some variance
54 | variance_factor = 1 + random.uniform(-token_variance, token_variance)
55 | total_prompt_tokens = int(base_prompt_tokens * variance_factor)
56 |
57 | # Calculate cached vs uncached split
58 | cached_tokens = int(total_prompt_tokens * cache_hit_rate)
59 | uncached_tokens = total_prompt_tokens - cached_tokens
60 |
61 | # Calculate cache writes
62 | cache_writes = int(uncached_tokens * cache_write_rate)
63 |
64 | # Generate completion tokens (typically 20-80% of prompt tokens)
65 | completion_tokens = int(total_prompt_tokens * random.uniform(0.2, 0.8))
66 |
67 | # Calculate tool and agent calls if enabled
68 | tool_calls = 0
69 | agent_calls = 0
70 |
71 | if tools_enabled:
72 | # Typically 1-5 tool calls per interaction
73 | tool_calls = random.randint(1, 5)
74 |
75 | if agents_enabled:
76 | # Typically 0-3 agent calls per interaction
77 | agent_calls = random.randint(0, 3)
78 |
79 | # Calculate total cost
80 | total_tokens = total_prompt_tokens + completion_tokens
81 | cost = (total_tokens / 1000) * cost_per_1k_tokens
82 |
83 | return AgentMetrics(
84 | start_time=start_time,
85 | end_time=end_time,
86 | token_usage=TokenUsage(
87 | uncached_prompt_tokens=uncached_tokens - cache_writes,
88 | cache_write_prompt_tokens=cache_writes,
89 | cached_prompt_tokens=cached_tokens,
90 | completion_tokens=completion_tokens,
91 | ),
92 | cost=cost,
93 | tool_calls=tool_calls,
94 | agent_calls=agent_calls,
95 | )
96 |
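The fixed seed and fixed base time make the generated metrics reproducible across calls, which keeps tests stable; a minimal sketch, assuming this module is importable:

    m1 = make_random_agent_metrics(seed=7)
    m2 = make_random_agent_metrics(seed=7)
    assert m1.cost == m2.cost and m1.tool_calls == m2.tool_calls  # identical draws
    assert make_random_agent_metrics(seed=8).cost != m1.cost  # different seed, (almost surely) different draws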
--------------------------------------------------------------------------------
/base_agent/src/utils/parsing.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Some parsing utilities.
8 |
9 | This module provides utilities for parsing various types of data,
10 | particularly focusing on numerical parsing from strings.
11 | """
12 |
13 | import re
14 |
15 | from typing import Optional, Literal
16 |
17 | def extract_before_last(text: str, pattern: str, keep_pattern: bool = False) -> str:
18 | last_pos = text.rfind(pattern)
19 | offset = len(pattern) if keep_pattern else 0
20 | return text[:last_pos + offset] if last_pos != -1 else ""
21 |
22 | def extract_after_last(text: str, pattern: str, keep_pattern: bool = False) -> str:
23 | last_pos = text.rfind(pattern)
24 | offset = 0 if keep_pattern else len(pattern)
25 | return text[last_pos + offset:] if last_pos != -1 else ""
26 |
27 |
28 | def extract_after_first(text: str, pattern: str, keep_pattern: bool = False) -> str:
29 | first_pos = text.find(pattern)
30 | offset = 0 if keep_pattern else len(pattern)
31 | return text[first_pos + offset:] if first_pos != -1 else ""
32 |
33 |
34 | def extract_between_patterns(
35 | s: str,
36 | pattern_a: str,
37 | pattern_b: str,
38 | a_occurrence: Literal["first"] | Literal["last"] = "first",
39 | b_occurrence: Literal["first"] | Literal["last"] = "last",
40 | ) -> str | None:
41 | # Validate both occurrences upfront
42 | if a_occurrence not in ("first", "last"):
43 | raise ValueError("Invalid value for a_occurrence. Use 'first' or 'last'.")
44 | if b_occurrence not in ("first", "last"):
45 | raise ValueError("Invalid value for b_occurrence. Use 'first' or 'last'.")
46 |
47 | # Determine the index for `pattern_a`
48 | if a_occurrence == "first":
49 | start_index = s.find(pattern_a)
50 | else: # "last"
51 | start_index = s.rfind(pattern_a)
52 |
53 | if start_index == -1:
54 | return None
55 |
56 | start_index += len(pattern_a)
57 |
58 | # Determine the index for `pattern_b`
59 | if b_occurrence == "first":
60 | end_index = s.find(pattern_b)
61 | else: # "last"
62 | end_index = s.rfind(pattern_b)
63 |
64 | if end_index == -1 or end_index <= start_index:
65 | return None
66 |
67 | return s[start_index:end_index]
68 |
69 |
70 | def parse_number_from_string(
71 | answer: str,
72 | ) -> tuple[bool, Optional[float], Optional[str]]:
73 | cleaned = answer.strip().replace(",", "")
74 |
75 | # Pattern for a valid number segment
76 | number_pattern = r"-?\d*\.?\d+(?:[eE][-+]?\d+)?"
77 | match = re.search(number_pattern, cleaned)
78 |
79 | if not match:
80 | return (
81 | False,
82 | None,
83 | "Could not find a number in the answer. Please provide a clear numerical response.",
84 | )
85 |
86 | matched_str = match.group()
87 | # Check for multiple decimal points in the matched string
88 | if matched_str.count(".") > 1:
89 | return (
90 | False,
91 | None,
92 | "Found what looks like a number but couldn't parse it: too many decimal points",
93 | )
94 |
95 | try:
96 | value = float(matched_str)
97 | full_match = matched_str == cleaned
98 | if not full_match:
99 | return (
100 | True,
101 | value,
102 | "Warning: Found additional text around the number. In future, try to provide just the number.",
103 | )
104 | return True, value, None
105 | except ValueError as e:
106 | return (
107 | False,
108 | None,
109 | f"Found what looks like a number but couldn't parse it: {str(e)}",
110 | )
111 |
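A few doctest-style examples of the helpers above (a sketch, assuming this module's names are in scope):

    assert extract_after_last("a/b/c.py", "/") == "c.py"
    assert extract_before_last("a/b/c.py", "/") == "a/b"
    assert extract_between_patterns("<x>mid</x>", "<x>", "</x>") == "mid"

    ok, value, note = parse_number_from_string("The answer is 1,234.5")
    assert ok and value == 1234.5
    assert note is not None  # warns about the surrounding text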
--------------------------------------------------------------------------------
/base_agent/src/utils/stop_tokens.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | # WARNING: while you can read this file, editing it directly will stop
7 | # your generation abruptly and cause you to fail!
8 | #
9 | # If you want to add a new stop token for the next agent iteration, then you
10 | # should append it to this file using a terminal tool like:
11 | # echo 'NEW_STOP_TOKEN = ""' >> tools/stop_tokens.py
12 | #
13 | # If you want to remove one, then make a line edit using something like:
14 | # sed -i 'd' tools/stop_tokens.py.
15 | # Note that the first token, TOOL_STOP_TOKEN, is on line 14 of this file after
16 | # this comment is counted. To delete it, you'd do:
17 | # sed -i '14d' tools/stop_tokens.py.
18 |
19 | TOOL_STOP_TOKEN = ""
20 | AGENT_STOP_TOKEN = ""
21 | OVERSEER_STOP_TOKEN = ""
22 |
--------------------------------------------------------------------------------
/base_agent/src/web_server/__init__.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Web server package for callgraph visualization."""
7 |
8 | from .server import run_server
9 |
10 | __all__ = ["run_server"]
11 |
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/components/metrics-display.js:
--------------------------------------------------------------------------------
1 | /**
2 | * MetricsDisplay component
3 | */
4 |
5 | import { Component } from "../core.js";
6 | import { formatters } from "../utils/formatters.js";
7 | import { store } from "../store.js";
8 |
9 | export class MetricsDisplay extends Component {
10 | constructor() {
11 | super();
12 | this.attachShadow({ mode: "open" });
13 |
14 | // Add styles
15 | const style = document.createElement("style");
16 | style.textContent = `
17 | :host {
18 | display: flex;
19 | flex-wrap: wrap;
20 | align-items: center;
21 | color: white;
22 | }
23 | .metric {
24 | display: flex;
25 | align-items: center;
26 | margin-right: 1.5rem;
27 | }
28 | .label {
29 | font-size: 0.75rem;
30 | text-transform: uppercase;
31 | letter-spacing: 0.05em;
32 | color: #d1d5db;
33 | }
34 | .value {
35 | margin-left: 0.5rem;
36 | font-size: 0.875rem;
37 | font-weight: 500;
38 | }
39 | .cached {
40 | font-size: 0.75rem;
41 | color: #d1d5db;
42 | }
43 | `;
44 | this.shadowRoot.appendChild(style);
45 |
46 | // Create container
47 | this.container = document.createElement("div");
48 | this.container.style.display = "flex";
49 | this.container.style.flexWrap = "wrap";
50 | this.container.style.alignItems = "center";
51 | this.shadowRoot.appendChild(this.container);
52 |
53 | // Listen for state changes
54 | document.addEventListener("state-change", (e) => {
55 | if (e.detail.property === "callgraphData") {
56 | this.setState({ data: e.detail.value });
57 | }
58 | });
59 | }
60 |
61 | render() {
62 | const data = this.state.data || {};
63 |
64 | this.container.innerHTML = `
65 | <div class="metric">
66 | <span class="label">Duration</span>
67 | <span class="value">${formatters.duration(data.total_duration)}</span>
68 | </div>
69 | <div class="metric">
70 | <span class="label">Total Tokens</span>
71 | <span class="value">${formatters.tokens(data.total_tokens)}</span>
72 | <span class="cached">${data.total_tokens ? formatters.cachePercent(data.num_cached_tokens, data.total_tokens) : "-"}</span>
73 | </div>
74 | <div class="metric">
75 | <span class="label">Cost</span>
76 | <span class="value">${formatters.cost(data.total_cost)}</span>
77 | </div>
78 | `;
79 | }
80 | }
81 |
82 | customElements.define("metrics-display", MetricsDisplay);
83 |
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/core.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Core reactive system for the visualization
3 | */
4 |
5 | // Base Component class
6 | export class Component extends HTMLElement {
7 | constructor() {
8 | super();
9 | this.state = new Proxy(
10 | {},
11 | {
12 | set: (target, property, value) => {
13 | target[property] = value;
14 | this.render();
15 | return true;
16 | },
17 | },
18 | );
19 | }
20 |
21 | setState(newState) {
22 | Object.assign(this.state, newState);
23 | }
24 |
25 | render() {
26 | // Override in subclasses
27 | }
28 |
29 | connectedCallback() {
30 | this.render();
31 | }
32 | }
33 |
34 | // HTML escaping utility
35 | export function escapeHtml(unsafe) {
36 | return unsafe
37 | .replace(/&/g, "&")
38 | .replace(//g, ">")
40 | .replace(/"/g, """)
41 | .replace(/'/g, "'");
42 | }
43 |
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/store.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Centralized state management with WebSocket support
3 | */
4 |
5 | export const store = new Proxy(
6 | {
7 | callgraphData: null,
8 | },
9 | {
10 | set(target, property, value) {
11 | target[property] = value;
12 | document.dispatchEvent(
13 | new CustomEvent("state-change", {
14 | detail: { property, value },
15 | })
16 | );
17 | return true;
18 | },
19 | }
20 | );
21 |
22 | let socket;
23 |
24 | export async function updateVisualization() {
25 | try {
26 | const response = await fetch("/api/callgraph");
27 | const data = await response.json();
28 |
29 | // Skip if data hasn't changed
30 | if (JSON.stringify(data) !== JSON.stringify(store.callgraphData)) {
31 | store.callgraphData = data;
32 | }
33 | } catch (error) {
34 | console.error("Error updating visualization:", error);
35 | }
36 | }
37 |
38 | function connectWebSocket() {
39 | socket = new WebSocket(`ws://${window.location.host}/ws`);
40 |
41 | socket.onopen = () => {
42 | console.log("WebSocket connected");
43 | };
44 |
45 | socket.onmessage = (event) => {
46 | const message = JSON.parse(event.data);
47 | if (message.type === 'event') {
48 | // Get latest data to incorporate the new event
49 | updateVisualization();
50 | }
51 | };
52 |
53 | socket.onclose = () => {
54 | console.log("WebSocket disconnected. Reconnecting...");
55 | setTimeout(connectWebSocket, 1000);
56 | };
57 |
58 | socket.onerror = (error) => {
59 | console.error("WebSocket error:", error);
60 | };
61 | }
62 |
63 | // Start WebSocket connection and initial data load
64 | export function startUpdates() {
65 | updateVisualization(); // Initial load
66 | connectWebSocket(); // Real-time updates
67 | }
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/styles.css:
--------------------------------------------------------------------------------
1 | /* Base styles */
2 | body {
3 | font-family: "Inter", sans-serif;
4 | color: #1f2937;
5 | }
6 |
7 | /* Global styles */
8 | .execution-tree,
9 | #event-stream {
10 | font-family: "JetBrains Mono", monospace;
11 | font-size: 13px;
12 | line-height: 1.3;
13 | }
14 |
15 | /* Header styles */
16 | .header {
17 | background: linear-gradient(90deg, #1e293b 0%, #334155 100%);
18 | }
19 |
20 | /* Tree visualization styles */
21 | .execution-tree {
22 | position: relative;
23 | }
24 |
25 | .execution-tree .node {
26 | margin-bottom: 0.25rem;
27 | position: relative;
28 | }
29 |
30 | .execution-tree .node-content {
31 | margin-left: 1.25rem;
32 | position: relative;
33 | }
34 |
35 | /* Vertical line for tree structure */
36 | .execution-tree .node-content::before {
37 | content: "";
38 | position: absolute;
39 | left: -12px;
40 | top: 0;
41 | bottom: 0;
42 | width: 2px;
43 | background-color: #e2e8f0;
44 | }
45 |
46 | /* Hover effect for collapsible areas */
47 | .execution-tree .node-content:hover::before {
48 | background-color: #93c5fd;
49 | }
50 |
51 | /* Reduce vertical space */
52 | .execution-tree .event-entry,
53 | #event-stream .event {
54 | padding-top: 0.125rem;
55 | padding-bottom: 0.125rem;
56 | }
57 |
58 | /* Event line styles */
59 | .event-line {
60 | position: relative;
61 | }
62 |
63 | /* Event stream styles */
64 | #event-stream .event {
65 | margin-bottom: 1rem;
66 | border-radius: 0.25rem;
67 | overflow: hidden;
68 | box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
69 | }
70 |
71 | #event-stream .event-content,
72 | #event-stream .event-full-content {
73 | background-color: #f8fafc;
74 | transition: background-color 0.2s;
75 | }
76 |
77 | #event-stream .event-content:hover,
78 | #event-stream .event-full-content:hover {
79 | background-color: #f1f5f9;
80 | }
81 |
82 | /* Execution tree hover styles */
83 | .execution-tree .cursor-pointer {
84 | transition: background-color 0.2s;
85 | }
86 |
87 | .execution-tree .cursor-pointer:hover {
88 | background-color: #eff6ff;
89 | }
90 |
91 | /* Status indicators */
92 | .status-indicator {
93 | display: inline-block;
94 | width: 10px;
95 | height: 10px;
96 | border-radius: 50%;
97 | margin-right: 0.5rem;
98 | }
99 |
100 | .status-pending {
101 | background-color: #fbbf24;
102 | }
103 |
104 | .status-running {
105 | background-color: #60a5fa;
106 | animation: pulse 2s infinite;
107 | }
108 |
109 | .status-success {
110 | background-color: #34d399;
111 | }
112 |
113 | .status-failed {
114 | background-color: #f87171;
115 | }
116 |
117 | @keyframes pulse {
118 | 0% {
119 | opacity: 1;
120 | }
121 | 50% {
122 | opacity: 0.6;
123 | }
124 | 100% {
125 | opacity: 1;
126 | }
127 | }
128 |
129 | /* Animation */
130 | @keyframes highlight {
131 | 0% {
132 | background-color: rgba(59, 130, 246, 0.1);
133 | }
134 | 50% {
135 | background-color: rgba(59, 130, 246, 0.1);
136 | }
137 | 100% {
138 | background-color: transparent;
139 | }
140 | }
141 |
142 | .event-highlight {
143 | animation: highlight 2s ease-in-out;
144 | }
145 |
146 | /* Scrollbar styles */
147 | ::-webkit-scrollbar {
148 | width: 8px;
149 | height: 8px;
150 | }
151 |
152 | ::-webkit-scrollbar-track {
153 | background: #f1f5f9;
154 | border-radius: 4px;
155 | }
156 |
157 | ::-webkit-scrollbar-thumb {
158 | background: #cbd5e1;
159 | border-radius: 4px;
160 | }
161 |
162 | ::-webkit-scrollbar-thumb:hover {
163 | background: #94a3b8;
164 | }
165 |
166 | /* Utility classes */
167 | .truncate {
168 | white-space: nowrap;
169 | overflow: hidden;
170 | text-overflow: ellipsis;
171 | }
172 |
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/utils/event-utils.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Event-related utilities
3 | */
4 |
5 | // Event type mapping for badges
6 | export function getEventBadge(type) {
7 | const badges = {
8 | assistant_message: "assistant",
9 | tool_call: "tool",
10 | tool_result: "tool",
11 | agent_call: "agent",
12 | agent_result: "agent",
13 | overseer_notification: "overseer",
14 | system_prompt_update: "system",
15 | core_prompt_update: "system",
16 | default: "system",
17 | };
18 | return badges[type] || badges.default;
19 | }
20 |
21 | // Get node status indicator
22 | export function getStatusIndicator(node) {
23 | if (!node.started_at) {
24 | return { class: "status-pending", label: "Pending" };
25 | }
26 | if (!node.completed_at) {
27 | return { class: "status-running", label: "Running" };
28 | }
29 | return node.success
30 | ? { class: "status-success", label: "Success" }
31 | : { class: "status-failed", label: "Failed" };
32 | }
33 |
34 | // Creates a chronological event stream from all events across all nodes
35 | export function createChronologicalEventStream(nodes) {
36 | const allEvents = [];
37 | Object.entries(nodes).forEach(([nodeId, node]) => {
38 | if (node.events) {
39 | allEvents.push(
40 | ...node.events.map((event) => ({
41 | nodeId,
42 | nodeName: node.name,
43 | event,
44 | time: new Date(event.timestamp),
45 | })),
46 | );
47 | }
48 | });
49 | return allEvents.sort((a, b) => a.time - b.time);
50 | }
51 |
52 | // Sort events while maintaining agent call sequence
53 | export function sortNodeEvents(events) {
54 | const sortedEvents = [];
55 | const tempEvents = [...events].sort(
56 | (a, b) => new Date(a.timestamp) - new Date(b.timestamp),
57 | );
58 |
59 | let i = 0;
60 | while (i < tempEvents.length) {
61 | const event = tempEvents[i];
62 | sortedEvents.push(event);
63 | i++;
64 |
65 | if (event.type === "agent_call") {
66 | const callTime = new Date(event.timestamp);
67 | const agentEvents = [];
68 | let j = i;
69 | let foundResult = false;
70 | while (j < tempEvents.length && !foundResult) {
71 | const nextEvent = tempEvents[j];
72 | if (
73 | nextEvent.type === "agent_result" &&
74 | new Date(nextEvent.timestamp) > callTime
75 | ) {
76 | agentEvents.push(nextEvent);
77 | tempEvents.splice(j, 1);
78 | foundResult = true;
79 | continue;
80 | }
81 | tempEvents.splice(j, 1);
82 | agentEvents.push(nextEvent);
83 | }
84 | sortedEvents.push(...agentEvents);
85 | }
86 | }
87 |
88 | return sortedEvents;
89 | }
90 |
91 | // UI interaction functions
92 | export function toggleContent(index) {
93 | // Get event-stream component
94 | const eventStream = document.querySelector("event-stream");
95 | if (eventStream && eventStream.shadowRoot) {
96 | const truncated = eventStream.shadowRoot.querySelector(
97 | `#event-${index} .event-content`,
98 | );
99 | const full = eventStream.shadowRoot.querySelector(`#event-full-${index}`);
100 | if (truncated && full) {
101 | if (truncated.classList.contains("hidden")) {
102 | truncated.classList.remove("hidden");
103 | full.classList.add("hidden");
104 | } else {
105 | truncated.classList.add("hidden");
106 | full.classList.remove("hidden");
107 | }
108 | }
109 | }
110 | }
111 |
112 | export function scrollToTop() {
113 | window.scrollTo({ top: 0, behavior: "smooth" });
114 | }
115 |
116 | export function scrollToStreamEvent(index) {
117 | // Get event-stream component
118 | const eventStream = document.querySelector("event-stream");
119 | if (eventStream && eventStream.shadowRoot) {
120 | const streamEvent = eventStream.shadowRoot.querySelector(`#event-${index}`);
121 | if (streamEvent) {
122 | streamEvent.scrollIntoView({ behavior: "smooth", block: "center" });
123 | streamEvent.classList.add("event-highlight");
124 | setTimeout(() => streamEvent.classList.remove("event-highlight"), 2000);
125 |
126 | // Expand the event details if needed
127 | const truncated = streamEvent.querySelector(`.event-content`);
128 | const full = streamEvent.querySelector(`#event-full-${index}`);
129 | if (truncated && full && truncated.classList.contains("hidden")) {
130 | toggleContent(index);
131 | }
132 | }
133 | }
134 | }
135 |
136 | export function toggleNode(nodeId) {
137 | const content = document.querySelector(`#${nodeId}-content`);
138 | if (content) {
139 | content.classList.toggle("hidden");
140 | }
141 | }
142 |
143 | // Expose required functions to window object for global access
144 | window.scrollToStreamEvent = scrollToStreamEvent;
145 | window.scrollToTop = scrollToTop;
146 | window.toggleContent = toggleContent;
147 |
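148 | // Behaviour note: sortNodeEvents() orders events by timestamp, then walks
149 | // the list so that everything between an agent_call and its matching
150 | // agent_result stays grouped with that call, keeping sub-agent activity
151 | // contiguous in the rendered stream.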
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/utils/formatters.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Formatting utilities
3 | */
4 |
5 | export const formatters = {
6 | duration: (s) =>
7 | !s
8 | ? "0s"
9 | : s < 60
10 | ? `${s.toFixed(1)}s`
11 | : `${Math.floor(s / 60)}m ${(s % 60).toFixed(1)}s`,
12 | tokens: (t) =>
13 | !t
14 | ? "0"
15 | : t < 1000
16 | ? `${t}`
17 | : t < 1000000
18 | ? `${(t / 1000).toFixed(1)}K`
19 | : `${(t / 1000000).toFixed(1)}M`,
20 | cost: (c) =>
21 | !c
22 | ? "$0.00"
23 | : c < 0.01
24 | ? `$${c.toFixed(5)}`
25 | : c < 0.1
26 | ? `$${c.toFixed(4)}`
27 | : c < 1
28 | ? `$${c.toFixed(3)}`
29 | : `$${c.toFixed(2)}`,
30 | cachePercent: (cached, total) =>
31 | !total ? "0%" : `${((cached / total) * 100).toFixed(1)}% cached`,
32 | };
33 |
34 | // Get total tokens from usage object
35 | export function getTotalTokens(usage) {
36 | if (!usage) return 0;
37 | return (
38 | (usage.uncached_prompt_tokens || 0) +
39 | (usage.cache_write_prompt_tokens || 0) +
40 | (usage.cached_prompt_tokens || 0) +
41 | (usage.completion_tokens || 0)
42 | );
43 | }
44 |
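45 | // Illustrative expected outputs (inputs assumed for exposition):
46 | //   formatters.duration(75)          -> "1m 15.0s"
47 | //   formatters.tokens(15300)         -> "15.3K"
48 | //   formatters.cost(0.005)           -> "$0.00500"
49 | //   formatters.cachePercent(50, 200) -> "25.0% cached"
50 | //   getTotalTokens({ completion_tokens: 10 }) -> 10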
--------------------------------------------------------------------------------
/base_agent/src/web_server/static/visualizer.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Main visualization entry point
3 | */
4 |
5 | import { startUpdates } from "./store.js";
6 | import "./components/execution-tree.js";
7 | import "./components/event-stream.js";
8 | import "./components/metrics-display.js";
9 | import {
10 | toggleContent,
11 | toggleNode,
12 | scrollToTop,
13 | scrollToStreamEvent,
14 | } from "./utils/event-utils.js";
15 |
16 | // Make UI functions globally available
17 | window.toggleContent = toggleContent;
18 | window.toggleNode = toggleNode;
19 | window.scrollToTop = scrollToTop;
20 | window.scrollToStreamEvent = scrollToStreamEvent;
21 |
22 | // Start updates
23 | startUpdates();
24 |
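25 | // Note: the component imports above are side-effectful: loading each module
26 | // registers its custom element (execution-tree, event-stream,
27 | // metrics-display, queried elsewhere via querySelector/shadowRoot) before
28 | // startUpdates() begins polling for state.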
--------------------------------------------------------------------------------
/base_agent/src/web_server/templates/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |     <title>Agent Execution</title>
5 |     <meta charset="UTF-8">
6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
7 |     <link rel="stylesheet" href="/static/styles.css">
8 | </head>
9 | <body>
10 |     <header>
11 |         <h1>Agent Execution</h1>
12 |         <metrics-display></metrics-display>
13 |     </header>
14 |     <main>
15 |         <section>
16 |             <h2>Execution Tree</h2>
17 |             <execution-tree></execution-tree>
18 |         </section>
19 |         <section>
20 |             <h2>Event Stream</h2>
21 |             <event-stream></event-stream>
22 |         </section>
23 |     </main>
24 |     <script type="module" src="/static/visualizer.js"></script>
25 | </body>
26 | </html>
--------------------------------------------------------------------------------
/base_agent/tests/benchmarks/test_gsm8k_benchmark.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Tests for the GSM8K benchmark implementation."""
7 | import pytest
8 | import tempfile
9 | import os
10 | from pathlib import Path
11 | from unittest.mock import patch, MagicMock
12 |
13 | from src.benchmarks.gsm8k import GSM8KBenchmark, GSM8KExample
14 |
15 |
16 | class TestGSM8KExample:
17 | """Tests for the GSM8KExample class."""
18 |
19 | def test_from_raw(self):
20 | """Test conversion from raw dataset example."""
21 | # Create a mock raw example
22 | raw_example = {
23 | "question": "John has 5 apples. He buys 2 more. How many does he have now?",
24 | "answer": "John has 5 apples initially.\nHe buys 2 more apples.\nSo he has 5 + 2 = <<5+2=7>> apples in total.\n#### 7"
25 | }
26 |
27 | example = GSM8KExample.from_raw(raw_example)
28 |
29 | assert example.answer == raw_example["answer"]
30 | assert example.steps == [
31 | "John has 5 apples initially.",
32 | "He buys 2 more apples.",
33 | "So he has 5 + 2 = <<5+2=7>> apples in total."
34 | ]
35 | assert example.final_answer == 7.0
36 |
37 | def test_extract_calculations(self):
38 | """Test extraction of calculations from solution steps."""
39 | raw_example = {
40 | "question": "Calculation test",
41 | "answer": "Step 1: Calculate 2 + 3 = <<2+3=5>>\nStep 2: Multiply by 4: 5 × 4 = <<5*4=20>>\n#### 20"
42 | }
43 |
44 | example = GSM8KExample.from_raw(raw_example)
45 | calculations = example.extract_calculations()
46 |
47 | assert len(calculations) == 2
48 |
49 | # First calculation
50 | expr1, expected1, actual1 = calculations[0]
51 | assert expr1 == "2+3"
52 | assert expected1 == 5
53 | assert actual1 == 5
54 |
55 | # Second calculation
56 | expr2, expected2, actual2 = calculations[1]
57 | assert expr2 == "5*4"
58 | assert expected2 == 20
59 | assert actual2 == 20
60 |
61 |
62 | @pytest.mark.parametrize("subset_size", [None, 5, 10])
63 | def test_benchmark_initialization(subset_size):
64 | """Test initializing the GSM8K benchmark with various subset sizes."""
65 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset:
66 | # Mock the dataset loading
67 | mock_dataset = MagicMock()
68 | mock_dataset.__getitem__.return_value = [
69 | {"question": f"Question {i}", "answer": f"Some steps\n#### {i}"}
70 | for i in range(1, 21) # Create 20 mock examples
71 | ]
72 | mock_load_dataset.return_value = mock_dataset
73 |
74 | benchmark = GSM8KBenchmark(seed=42, subset_size=subset_size)
75 |
76 | # Check benchmark properties
77 | assert benchmark.name == "gsm8k"
78 |
79 | # Verify subset_size is respected
80 | if subset_size:
81 | assert len(benchmark.problems) == subset_size
82 | else:
83 | assert len(benchmark.problems) == 20 # All examples
84 |
85 | # Verify problems have the expected structure
86 | for problem in benchmark.problems:
87 | assert isinstance(problem.statement, str)
88 | assert isinstance(problem.problem_id, str) # Just check it's a string
89 | assert isinstance(problem.answer, float)
90 | assert isinstance(problem.answer_discussion, str)
91 |
92 |
93 | @pytest.mark.asyncio
94 | async def test_score_problem_correct():
95 | """Test scoring a correct GSM8K answer."""
96 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset:
97 | # Mock the dataset loading
98 | mock_dataset = MagicMock()
99 | mock_dataset.__getitem__.return_value = [
100 | {"question": "Question 1", "answer": "Some steps\n#### 42"}
101 | ]
102 | mock_load_dataset.return_value = mock_dataset
103 |
104 | benchmark = GSM8KBenchmark(seed=42, subset_size=1)
105 | problem = benchmark.problems[0]
106 |
107 | # Create a temporary directory for the answer
108 | with tempfile.TemporaryDirectory() as tmp_dir:
109 | answer_dir = Path(tmp_dir)
110 |
111 | # Create answer.txt with the correct answer
112 | answer_file = answer_dir / "answer.txt"
113 | answer_file.write_text("42")
114 |
115 | # Score the answer
116 | score, errors, discussion = await benchmark.score_problem(
117 | problem=problem,
118 | agent_workdir="/fake/workdir",
119 | agent_answer_dir=str(answer_dir),
120 | container_name="fake_container"
121 | )
122 |
123 | # Verify the scoring
124 | assert score == 1.0
125 | assert errors is None
126 | assert discussion is not None
127 |
128 |
129 | @pytest.mark.asyncio
130 | async def test_score_problem_incorrect():
131 | """Test scoring an incorrect GSM8K answer."""
132 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset:
133 | # Mock the dataset loading
134 | mock_dataset = MagicMock()
135 | mock_dataset.__getitem__.return_value = [
136 | {"question": "Question 1", "answer": "Some steps\n#### 42"}
137 | ]
138 | mock_load_dataset.return_value = mock_dataset
139 |
140 | benchmark = GSM8KBenchmark(seed=42, subset_size=1)
141 | problem = benchmark.problems[0]
142 |
143 | # Create a temporary directory for the answer
144 | with tempfile.TemporaryDirectory() as tmp_dir:
145 | answer_dir = Path(tmp_dir)
146 |
147 | # Create answer.txt with an incorrect answer
148 | answer_file = answer_dir / "answer.txt"
149 | answer_file.write_text("43")
150 |
151 | # Score the answer
152 | score, errors, discussion = await benchmark.score_problem(
153 | problem=problem,
154 | agent_workdir="/fake/workdir",
155 | agent_answer_dir=str(answer_dir),
156 | container_name="fake_container"
157 | )
158 |
159 | # Verify the scoring
160 | assert score == 0.0
161 | assert errors is None
162 | assert discussion is not None
163 |
164 |
165 | @pytest.mark.asyncio
166 | async def test_score_problem_invalid_format():
167 | """Test scoring a GSM8K answer with invalid format."""
168 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset:
169 | # Mock the dataset loading
170 | mock_dataset = MagicMock()
171 | mock_dataset.__getitem__.return_value = [
172 | {"question": "Question 1", "answer": "Some steps\n#### 42"}
173 | ]
174 | mock_load_dataset.return_value = mock_dataset
175 |
176 | benchmark = GSM8KBenchmark(seed=42, subset_size=1)
177 | problem = benchmark.problems[0]
178 |
179 | # Create a temporary directory for the answer
180 | with tempfile.TemporaryDirectory() as tmp_dir:
181 | answer_dir = Path(tmp_dir)
182 |
183 | # Create answer.txt with an incorrectly formatted answer
184 | answer_file = answer_dir / "answer.txt"
185 | answer_file.write_text("The answer is forty-two")
186 |
187 | # Score the answer
188 | score, errors, discussion = await benchmark.score_problem(
189 | problem=problem,
190 | agent_workdir="/fake/workdir",
191 | agent_answer_dir=str(answer_dir),
192 | container_name="fake_container"
193 | )
194 |
195 | # Verify the scoring
196 | assert score == 0.0
197 | assert errors is not None # Should have parsing errors
198 | assert "could not convert string to float" in errors.lower() or "invalid literal" in errors.lower()
199 |
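200 | # Sketch for running just this module (from the base_agent directory,
201 | # assuming pytest-asyncio is configured for the @pytest.mark.asyncio tests):
202 | #   pytest tests/benchmarks/test_gsm8k_benchmark.py -v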
--------------------------------------------------------------------------------
/base_agent/tests/test_example.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from src.types.llm_types import Model
7 |
8 | def test_example():
9 | assert True
10 | assert isinstance(Model.SONNET_35.id, str)
11 |
--------------------------------------------------------------------------------
/base_agent/tests/tools/reasoning_structures/test_sequential.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Tests for the Sequential reasoning structure tool."""
7 | import pytest
8 | from unittest.mock import patch, AsyncMock
9 |
10 | from src.tools.reasoning_structures.sequential import (
11 | ToolBasedReasoningStructure, Step, InvocationState, _make_id
12 | )
13 | from src.agents.implementations import DemoAgent
14 | from src.types.tool_types import ToolResult
15 |
16 |
17 | # Do not use global pytestmark
18 | # Apply asyncio marker only to functions that need it
19 | @pytest.mark.asyncio
20 | async def test_initialization():
21 | """Test proper initialization of the reasoning structure."""
22 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent())
23 |
24 | # Verify basic properties
25 | assert structure.TOOL_NAME == "example_reasoning_structure"
26 | assert hasattr(structure, "_steps")
27 | assert len(structure._steps) > 0
28 | assert all(isinstance(step, Step) for step in structure._steps)
29 |
30 |
31 | @pytest.mark.asyncio
32 | async def test_run_initializes_state():
33 | """Test that run() correctly initializes state."""
34 | agent = DemoAgent()
35 | structure = ToolBasedReasoningStructure(calling_agent=agent)
36 |
37 | # Run the reasoning structure
38 | result = await structure.run()
39 |
40 | # Verify state initialization
41 | assert len(agent._local_state) == 1
42 |
43 | invocation_id = next(iter(agent._local_state.keys()))
44 | invocation = agent._local_state[invocation_id]
45 |
46 | assert isinstance(invocation, InvocationState)
47 | assert invocation.steps == structure._steps
48 | assert invocation.current_step_id == structure._steps[0].identifier
49 | assert invocation.current_step_complete_tool is not None
50 |
51 |
52 | @pytest.mark.asyncio
53 | async def test_run_registers_completion_tool():
54 | """Test that run() registers a completion tool for the first step."""
55 | # Create an empty mock registry
56 | mock_registry = {}
57 |
58 | # Apply the patch within the test
59 | with patch("src.tools.reasoning_structures.sequential.tool_registry", mock_registry):
60 | agent = DemoAgent()
61 | structure = ToolBasedReasoningStructure(calling_agent=agent)
62 |
63 | # Run the reasoning structure
64 | await structure.run()
65 |
66 | # Verify a tool was registered
67 | assert len(mock_registry) == 1
68 |
69 | # Get the registered tool
70 | tool_name = next(iter(mock_registry.keys()))
71 |
72 | # Verify it's a completion tool
73 | assert tool_name.endswith("_complete")
74 | assert mock_registry[tool_name] in agent._available_tools
75 |
76 |
77 | @pytest.mark.asyncio
78 | async def test_run_returns_correct_result():
79 | """Test that run() returns the expected result structure."""
80 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent())
81 |
82 | # Run the reasoning structure
83 | result = await structure.run()
84 |
85 | # Verify result properties
86 | assert isinstance(result, ToolResult)
87 | assert result.tool_name == structure.TOOL_NAME
88 | assert result.success is True
89 | assert "step id" in result.output.lower()
90 | assert "step instructions" in result.output.lower()
91 |
92 |
93 | @pytest.mark.asyncio
94 | async def test_step_completion_tool_creation():
95 | """Test the creation of step completion tools."""
96 | # Setup a mock for create_step_tool
97 | with patch("src.tools.reasoning_structures.sequential.create_step_tool") as mock_create_step_tool:
98 | # Setup mock return value
99 | mock_tool_cls = AsyncMock()
100 | mock_create_step_tool.return_value = mock_tool_cls
101 |
102 | # Create and run structure
103 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent())
104 | await structure.run()
105 |
106 | # Verify tool creation
107 | mock_create_step_tool.assert_called_once()
108 |
109 | # Check arguments
110 | args = mock_create_step_tool.call_args[0]
111 | assert isinstance(args[0], str) # invocation_id
112 | assert isinstance(args[1], Step) # step
113 |
114 | # No asyncio marker for this function since it's synchronous
115 | def test_step_creation_utility():
116 | """Test the utility function for creating step identifiers."""
117 | # Generate IDs with custom prefix
118 | ids = [_make_id("test_prefix") for _ in range(5)]
119 |
120 | # Verify uniqueness
121 | assert len(ids) == len(set(ids))
122 |
123 | # Verify format
124 | for id in ids:
125 | assert id.startswith("test_prefix_")
126 | assert len(id) > len("test_prefix_")
127 |
128 | # Verify default prefix works
129 | default_id = _make_id()
130 | assert default_id.startswith("step_")
131 |
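132 | # Note on test_run_registers_completion_tool: it patches the module-level
133 | # tool_registry with a throwaway dict so the completion tool created by
134 | # run() can be inspected without mutating the global registry shared by
135 | # the other tests.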
--------------------------------------------------------------------------------
/base_agent/tests/tools/test_base_tool.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """Tests for the BaseTool class functionality."""
7 | import pytest
8 | from unittest.mock import Mock, patch
9 | import asyncio
10 | from typing import Optional
11 |
12 | # Fix the import paths to work when running from the base_agent directory
13 | from src.tools.base_tool import BaseTool, tool_registry
14 | from src.types.tool_types import ToolResult
15 | from src.types.agent_types import AgentInterface
16 | from src.types.common import ArgFormat
17 |
18 | class TestBaseTool:
19 | """Test suite for BaseTool class."""
20 |
21 | def setup_method(self):
22 | """Setup for each test method."""
23 | # Save the original registry and clear it for testing
24 | self.original_registry = dict(tool_registry)
25 | tool_registry.clear()
26 |
27 | def teardown_method(self):
28 | """Teardown after each test method."""
29 | # Restore the original registry after each test
30 | tool_registry.clear()
31 | tool_registry.update(self.original_registry)
32 |
33 | def test_tool_registration(self):
34 | """Test that tools are properly registered through metaclass."""
35 | # Define a test tool class
36 | class TestTool(BaseTool):
37 | TOOL_NAME = "test_tool"
38 | TOOL_DESCRIPTION = "A test tool for registration"
39 |
40 | async def run(self) -> ToolResult:
41 | return ToolResult(tool_name=self.TOOL_NAME, success=True)
42 |
43 | @classmethod
44 | def generate_examples(cls):
45 | return []
46 |
47 | # Verify the tool was registered correctly
48 | assert "test_tool" in tool_registry
49 | assert tool_registry["test_tool"] == TestTool
50 |
51 | @pytest.mark.asyncio
52 | async def test_tool_examples(self):
53 | """Test that generate_examples returns valid examples."""
54 | # Define a test tool with examples
55 | class ExampleTool(BaseTool):
56 | TOOL_NAME = "example_tool"
57 | TOOL_DESCRIPTION = "Test tool with examples"
58 |
59 | async def run(self) -> ToolResult:
60 | return ToolResult(tool_name=self.TOOL_NAME, success=True)
61 |
62 | @classmethod
63 | def generate_examples(cls):
64 | # Return a minimal valid example
65 | mock_agent = Mock(spec=AgentInterface)
66 | tool_instance = cls(calling_agent=mock_agent)
67 | tool_result = ToolResult(tool_name=cls.TOOL_NAME, success=True)
68 | return [(tool_instance, tool_result)]
69 |
70 | # Check examples format
71 | examples = ExampleTool.generate_examples()
72 |
73 | assert isinstance(examples, list)
74 | assert len(examples) == 1
75 | example = examples[0]
76 | assert isinstance(example, tuple)
77 | assert len(example) == 2
78 | assert isinstance(example[0], ExampleTool)
79 | assert isinstance(example[1], ToolResult)
80 |
81 | @pytest.mark.asyncio
82 | async def test_args_str_to_dict(self):
83 | """Test XML and JSON argument parsing."""
84 | from pydantic import Field
85 |
86 | class ArgTool(BaseTool):
87 | TOOL_NAME = "arg_tool"
88 | TOOL_DESCRIPTION = "Test tool with arguments"
89 |
90 | arg1: str = Field(..., description="Test argument")
91 | arg2: int = Field(default=0, description="Optional argument")
92 |
93 | async def run(self) -> ToolResult:
94 | return ToolResult(tool_name=self.TOOL_NAME, success=True)
95 |
96 | @classmethod
97 | def generate_examples(cls):
98 | return []
99 |
100 | # Test XML parsing
101 | xml_args = """
102 | <args>
103 |     <arg1>test</arg1>
104 |     <arg2>42</arg2>
105 | </args>
106 | """
107 | args_dict, warnings = await ArgTool.args_str_to_dict(xml_args, ArgFormat.XML)
108 | assert args_dict is not None
109 | assert args_dict["arg1"] == "test"
110 | assert args_dict["arg2"] == 42
111 | assert warnings is None
112 |
113 | # Test bad XML - this should result in a warning and possibly a None args_dict
114 | # or a dict with only default values, depending on the implementation
115 | bad_xml = "test"
116 | args_dict, warnings = await ArgTool.args_str_to_dict(bad_xml, ArgFormat.XML)
117 | # The important thing is that a warning is generated
118 | assert warnings is not None
119 |
120 | # We don't make assumptions about whether args_dict is None or partially populated
121 | # as implementation details can vary. If it's None, the test passes.
122 | # If not None, check that it doesn't contain the required field or that it does have defaults.
123 | if args_dict is not None:
124 | # It might contain default values but not the required field
125 | assert "arg1" not in args_dict, "Required field should not be present in malformed XML"
126 | # Optionally check if default values are preserved
127 | # We don't assert this as it's an implementation detail that could change
128 | # assert args_dict.get("arg2") == 0, "Default value should be present"
129 |
130 | @pytest.mark.asyncio
131 | async def test_tool_result_formatting(self):
132 | """Test that tool results are properly formatted."""
133 | # Create a mock agent for testing
134 | mock_agent = Mock(spec=AgentInterface)
135 |
136 | # Define a simple test tool
137 | class ResultTool(BaseTool):
138 | TOOL_NAME = "result_tool"
139 | TOOL_DESCRIPTION = "Test tool for result formatting"
140 |
141 | async def run(self) -> ToolResult:
142 | return ToolResult(
143 | tool_name=self.TOOL_NAME,
144 | success=True,
145 | output="test output",
146 | warnings="test warning",
147 | errors=None
148 | )
149 |
150 | @classmethod
151 | def generate_examples(cls):
152 | return []
153 |
154 | # Test successful tool execution
155 | tool = ResultTool(calling_agent=mock_agent)
156 | result = await tool.run()
157 |
158 | # Check result structure
159 | assert isinstance(result, ToolResult)
160 | assert result.tool_name == "result_tool"
161 | assert result.success is True
162 | assert "test output" in str(result)
163 | assert "test warning" in str(result)
164 |
165 | # Test failure result formatting
166 | failure_result = ToolResult(
167 | tool_name="fail_tool",
168 | success=False,
169 | output=None,
170 | warnings=None,
171 | errors="test error"
172 | )
173 |
174 | # Check failure result structure
175 | assert isinstance(failure_result, ToolResult)
176 | assert failure_result.tool_name == "fail_tool"
177 | assert failure_result.success is False
178 | assert "test error" in str(failure_result)
179 | assert "SUCCESS" not in str(failure_result)
180 | assert "FAILURE" in str(failure_result)
181 |
182 | if __name__ == "__main__":
183 | # Run the tests directly for debugging
184 | pytest.main(["-xvs", __file__])
185 |
--------------------------------------------------------------------------------
/base_agent/tests/utils/test_parsing.py:
--------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Tests for the parsing utilities module.
8 | """
9 | import re
10 | import pytest
11 | from src.utils.parsing import (
12 | extract_before_last,
13 | extract_after_last,
14 | extract_after_first,
15 | extract_between_patterns,
16 | parse_number_from_string,
17 | )
18 |
19 |
20 | # Test extract_before_last
21 | @pytest.mark.parametrize(
22 | "text, pattern, keep_pattern, expected",
23 | [
24 | ("hello world hello", "hello", False, "hello world "), # Basic case
25 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern
26 | ("no pattern here", "xyz", False, ""), # Pattern not found
27 | ("", "hello", False, ""), # Empty string
28 | ("hello", "hello", False, ""), # Pattern at end
29 | ],
30 | ids=["basic", "keep_pattern", "not_found", "empty", "end_pattern"],
31 | )
32 | def test_extract_before_last(text, pattern, keep_pattern, expected):
33 | result = extract_before_last(text, pattern, keep_pattern)
34 | assert result == expected, f"Expected '{expected}', got '{result}'"
35 |
36 |
37 | # Test extract_after_last
38 | @pytest.mark.parametrize(
39 | "text, pattern, keep_pattern, expected",
40 | [
41 | ("hello world hello", "hello", False, ""), # Last occurrence at end
42 | ("hello world hello", "hello", True, "hello"), # Keep pattern
43 | ("hello world hello", "world", False, " hello"), # Middle occurrence
44 | ("no pattern here", "xyz", False, ""), # Pattern not found
45 | ("hello", "hello", True, "hello"), # Single pattern
46 | ],
47 | ids=["end", "keep_pattern", "middle", "not_found", "single"],
48 | )
49 | def test_extract_after_last(text, pattern, keep_pattern, expected):
50 | result = extract_after_last(text, pattern, keep_pattern)
51 | assert result == expected
52 |
53 |
54 | # Test extract_after_first
55 | @pytest.mark.parametrize(
56 | "text, pattern, keep_pattern, expected",
57 | [
58 | ("hello world hello", "hello", False, " world hello"), # First occurrence
59 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern
60 | ("no pattern here", "xyz", False, ""), # Pattern not found
61 | ("hello", "he", False, "llo"), # Partial pattern
62 | ("", "xyz", False, ""), # Empty string
63 | ],
64 | ids=["basic", "keep_pattern", "not_found", "partial", "empty"],
65 | )
66 | def test_extract_after_first(text, pattern, keep_pattern, expected):
67 | result = extract_after_first(text, pattern, keep_pattern)
68 | assert result == expected
69 |
70 |
71 | # Test extract_between_patterns
72 | @pytest.mark.parametrize(
73 | "text, pattern_a, pattern_b, a_occ, b_occ, expected",
74 | [
75 | # First/Last combinations
76 | ("start middle end", "start", "end", "first", "last", " middle "),
77 | ("a b a c a d", "a", "a", "first", "last", " b a c "),
78 | ("a b a c a d", "a", "a", "last", "first", None), # Invalid range
79 | # Pattern not found
80 | ("hello world", "xyz", "abc", "first", "last", None),
81 | ("hello world", "hello", "xyz", "first", "last", None),
82 | # Edge cases
83 | ("", "a", "b", "first", "last", None), # Empty string
84 | ("abc", "a", "c", "first", "last", "b"), # Adjacent patterns
85 | ],
86 | ids=[
87 | "first_last",
88 | "multiple_a_last",
89 | "invalid_range",
90 | "a_missing",
91 | "b_missing",
92 | "empty",
93 | "adjacent",
94 | ],
95 | )
96 | def test_extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ, expected):
97 | result = extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ)
98 | assert result == expected
99 |
100 |
101 | # Test extract_between_patterns with invalid occurrence values
102 | @pytest.mark.parametrize(
103 | "a_occ, b_occ",
104 | [("invalid", "first"), ("first", "invalid")],
105 | ids=["invalid_a", "invalid_b"],
106 | )
107 | def test_extract_between_patterns_invalid_occurrence(a_occ, b_occ):
108 | with pytest.raises(ValueError, match="Invalid value for.*occurrence"):
109 | extract_between_patterns("text", "a", "b", a_occ, b_occ)
110 |
111 |
112 | # Fixture for parse_number_from_string tests
113 | @pytest.fixture
114 | def number_parser():
115 | return parse_number_from_string
116 |
117 |
118 | # Test parse_number_from_string
119 | @pytest.mark.parametrize(
120 | "input_str, expected",
121 | [
122 | # Successful cases
123 | ("42", (True, 42.0, None)),
124 | ("-3.14", (True, -3.14, None)),
125 | ("1,234.56", (True, 1234.56, None)), # Commas removed
126 | (" 6.022e23 ", (True, 6.022e23, None)), # Scientific notation
127 | # Success with warning
128 | ("42 extra text", (True, 42.0, "Warning: Found additional text.*")),
129 | # Failure cases
130 | ("no number here", (False, None, "Could not find a number.*")),
131 | ("", (False, None, "Could not find a number.*")),
132 | ],
133 | ids=[
134 | "integer",
135 | "negative_float",
136 | "comma_float",
137 | "scientific",
138 | "extra_text",
139 | "no_number",
140 | "empty",
141 | ],
142 | )
143 | def test_parse_number_from_string(number_parser, input_str, expected):
144 | success, value, message = number_parser(input_str)
145 | assert success == expected[0]
146 | assert value == expected[1]
147 | if message is not None and expected[2] is not None:
148 | assert re.match(expected[2], message) # Match regex pattern for message
149 | else:
150 | assert message == expected[2]
151 |
152 |
153 | # Example of a slow test (for demonstration)
154 | @pytest.mark.slow
155 | def test_parse_number_from_string_slow(number_parser):
156 | import time
157 | time.sleep(1) # Simulate slow operation
158 | success, value, _ = number_parser("12345")
159 | assert success and value == 12345.0
160 |
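161 | # The slow test above exists to demonstrate the "slow" marker; deselect it
162 | # during quick local runs (from the base_agent directory), e.g.:
163 | #   pytest tests/utils/test_parsing.py -m "not slow"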
--------------------------------------------------------------------------------
/benchmark_data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/benchmark_data/.gitkeep
--------------------------------------------------------------------------------
/figures/agent_execution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_execution.png
--------------------------------------------------------------------------------
/figures/agent_loop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_loop.png
--------------------------------------------------------------------------------
/results/interactive_output/agent_outputs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/results/interactive_output/agent_outputs/.gitkeep
--------------------------------------------------------------------------------
/sandbox/Dockerfile:
--------------------------------------------------------------------------------
1 | # Based on Fedora
2 | FROM fedora:42
3 |
4 | # Accept TARGET_ARCH build argument
5 | ARG TARGET_ARCH=x86_64
6 |
7 | # Set up the environment variables
8 | ENV SANDBOX_DIR=/home/agent \
9 | SHELL=/bin/bash \
10 | TZ=Etc/UTC \
11 | DEBIAN_FRONTEND=noninteractive \
12 | PATH=/opt/miniconda3/bin:$PATH
13 |
14 | # Setup agent user with sudo access
15 | RUN useradd -m -d /home/agent -s ${SHELL} agent && \
16 | echo "agent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/agent && \
17 | echo "Defaults env_keep += \"PATH\"" >> /etc/sudoers.d/agent && \
18 | chmod 0440 /etc/sudoers.d/agent
19 |
20 | # Install common dev tools
21 | RUN dnf -y install dnf-plugins-core && \
22 | dnf -y remove selinux-policy* && \
23 | dnf -y update && \
24 | dnf -y install \
25 | gcc gcc-c++ make git git-lfs llvm llvm-devel clang clang-devel \
26 | nodejs python3.12 python3.12-devel cmake openssh-server \
27 | tmux lsof strace gdb ltrace valgrind inotify-tools jq pv bzip2 unzip \
28 | p7zip wget curl sudo file tree which gettext-envsubst patch openssl \
29 | rsync zip nmap-ncat ripgrep perf poppler-utils lapack-devel blas-devel \
30 | openssl-devel libffi-devel procps-ng sysstat htop \
31 | libtiff-devel golang awk \
32 | # System and networking utilities
33 | hostname net-tools iproute iputils bind-utils tcpdump traceroute mtr \
34 | psmisc lsof netcat telnet whois tar gzip less findutils
35 |
36 | # Install Miniconda in /opt and set permissions
37 | USER root
38 | RUN mkdir -p /opt/miniconda3 && \
39 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${TARGET_ARCH}.sh -O /opt/miniconda3/miniconda.sh && \
40 | bash /opt/miniconda3/miniconda.sh -b -u -p /opt/miniconda3 && \
41 | rm /opt/miniconda3/miniconda.sh && \
42 | chown -R agent:agent /opt/miniconda3 && \
43 | chmod -R u+w /opt/miniconda3
44 |
45 |
46 | # Configure conda
47 | RUN /opt/miniconda3/bin/conda init --all && \
48 | /opt/miniconda3/bin/conda config --append channels conda-forge
49 |
50 | # Create system-wide conda initialization
51 | RUN echo '. /opt/miniconda3/etc/profile.d/conda.sh' >> /etc/bashrc && \
52 | echo 'source /opt/miniconda3/bin/activate' >> /etc/bashrc && \
53 | mkdir -p /etc/profile.d && \
54 | echo '. /opt/miniconda3/etc/profile.d/conda.sh' >> /etc/profile.d/conda.sh && \
55 | echo 'source /opt/miniconda3/bin/activate' >> /etc/profile.d/conda.sh && \
56 | chmod +x /etc/profile.d/conda.sh
57 |
58 | # Switch back to root for system configurations
59 | USER root
60 |
61 | # Setup Python 3.12 as default python
62 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \
63 | alternatives --set python3 /usr/bin/python3.12 && \
64 | alternatives --install /usr/bin/python python /usr/bin/python3 1
65 |
66 | # Create necessary directories for pnpm
67 | RUN mkdir -p ${SANDBOX_DIR}/.local/share/pnpm && \
68 | touch ${SANDBOX_DIR}/.bashrc && \
69 | chown -R agent:agent ${SANDBOX_DIR}/.local && \
70 | chown agent:agent ${SANDBOX_DIR}/.bashrc && \
71 | chown agent:agent ${SANDBOX_DIR}
72 |
73 | COPY configs/gitignore ${SANDBOX_DIR}/.gitignore
74 | RUN chown agent:agent ${SANDBOX_DIR}/.gitignore && \
75 | chmod +w ${SANDBOX_DIR}/.gitignore
76 |
77 | # Switch back to agent user for remaining setup
78 | USER agent
79 | WORKDIR ${SANDBOX_DIR}
80 |
81 | # Set directory permissions
82 | RUN mkdir -p ${SANDBOX_DIR}/.ssh && \
83 | chmod 700 ${SANDBOX_DIR}/.ssh && \
84 | touch ${SANDBOX_DIR}/.ssh/authorized_keys && \
85 | chmod 600 ${SANDBOX_DIR}/.ssh/authorized_keys
86 |
87 | RUN curl https://raw.githubusercontent.com/github/gitignore/main/Python.gitignore >> ${SANDBOX_DIR}/.gitignore && \
88 | curl https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore >> ${SANDBOX_DIR}/.gitignore
89 |
90 | # Install and setup pnpm
91 | ENV PNPM_HOME=${SANDBOX_DIR}/.local/share/pnpm
92 | ENV PATH=$PNPM_HOME:$PATH
93 | ENV NODE_OPTIONS=--max_old_space_size=4096
94 | RUN curl -fsSL https://get.pnpm.io/install.sh | ENV="${SANDBOX_DIR}/.bashrc" SHELL="/bin/bash" bash - && \
95 | . ${SANDBOX_DIR}/.bashrc && \
96 | echo "export PNPM_HOME=$PNPM_HOME" >> ${SANDBOX_DIR}/.bashrc && \
97 | echo "export PATH=$PNPM_HOME:\$PATH" >> ${SANDBOX_DIR}/.bashrc && \
98 | . ${SANDBOX_DIR}/.bashrc && \
99 | pnpm install -g typescript ts-node @types/node prettier eslint tsx
100 |
101 | # Install and configure Rust using rustup
102 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
103 | . ${SANDBOX_DIR}/.cargo/env && \
104 | rustup component add rust-src && \
105 | echo '. ${SANDBOX_DIR}/.cargo/env' >> ${SANDBOX_DIR}/.bashrc
106 |
107 | # Install LSP Servers for common languages with architecture awareness
108 | RUN . ${SANDBOX_DIR}/.bashrc && \
109 | # Python - Pyright
110 | sudo dnf install -y npm && \
111 | sudo npm install -g pyright && \
112 | # JavaScript/TypeScript
113 | pnpm install -g typescript-language-server typescript && \
114 |     # Rust - add rust-analyzer for the toolchain installed above
115 |     # (rustup, rust-src and the .bashrc hook were already set up in the
116 |     # earlier Rust layer, so only the analyzer component is added here)
117 |     . ${SANDBOX_DIR}/.cargo/env && \
118 |     rustup component add rust-analyzer && \
119 |     # Go - Install gopls
120 |     go install golang.org/x/tools/gopls@latest
121 |
122 | # Configure environment
123 | ENV HOME=${SANDBOX_DIR}
124 |
125 | # Copy and install some base requirements
126 | COPY base_requirements.txt /tmp/base_requirements.txt
127 | RUN pip install -r /tmp/base_requirements.txt && \
128 | sudo rm /tmp/base_requirements.txt
129 |
130 | # Copy and install agent dependencies (maintaining current approach)
131 | COPY --from=base_agent --chown=agent:agent . /tmp/base_agent
132 | RUN cd /tmp/base_agent && pip install -r requirements.txt
133 |
134 | WORKDIR ${SANDBOX_DIR}
135 |
136 | # Expose necessary ports (maintaining current approach)
137 | EXPOSE 5000 80 22 443 8080 8000
138 |
139 | ARG ANTHROPIC_API_KEY
140 | ARG OPENAI_API_KEY
141 | ARG FIREWORKS_AI_API_KEY
142 | ARG GEMINI_API_KEY
143 | ARG DEEPSEEK_API_KEY
144 | ARG VERTEX_PROJECT_ID
145 |
146 | ENV ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
147 | ENV OPENAI_API_KEY=${OPENAI_API_KEY}
148 | ENV FIREWORKS_AI_API_KEY=${FIREWORKS_AI_API_KEY}
149 | ENV GEMINI_API_KEY=${GEMINI_API_KEY}
150 | ENV DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY}
151 | ENV VERTEX_PROJECT_ID=${VERTEX_PROJECT_ID}
152 |
153 | COPY GOOGLE_APPLICATION_CREDENTIALS.json /tmp/GOOGLE_APPLICATION_CREDENTIALS.json
154 | ENV GOOGLE_APPLICATION_CREDENTIALS=/tmp/GOOGLE_APPLICATION_CREDENTIALS.json
155 |
156 | # Set the entrypoint (maintaining current approach)
157 | CMD ["/bin/bash", "--login"]
158 |
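159 | # Illustrative build sketch (an assumption, not the project's canonical
160 | # command): the COPY --from=base_agent above expects a named BuildKit
161 | # context, e.g.
162 | #   docker buildx build --build-context base_agent=./base_agent \
163 | #     --build-arg TARGET_ARCH=$(uname -m) -t agent-sandbox ./sandbox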
--------------------------------------------------------------------------------
/sandbox/GOOGLE_APPLICATION_CREDENTIALS.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "service_account",
3 | "project_id": "",
4 | "private_key_id": "",
5 | "private_key": "",
6 | "client_email": "",
7 | "client_id": "",
8 | "auth_uri": "",
9 | "token_uri": "",
10 | "auth_provider_x509_cert_url": "",
11 | "client_x509_cert_url": "",
12 | "universe_domain": ""
13 | }
14 |
--------------------------------------------------------------------------------
/sandbox/base_requirements.txt:
--------------------------------------------------------------------------------
1 | # System-wide Python packages for development
2 | # Intentionally _not_ pinning versions so we get recent versions on every build
3 | black
4 | flake8
5 |
--------------------------------------------------------------------------------
/sandbox/configs/gitignore:
--------------------------------------------------------------------------------
1 | # IDEs
2 | .idea/
3 | .vscode/
4 | *.swp
5 | *.swo
6 |
7 | # Build outputs
8 | target/
9 | dist/
10 | build/
11 | *.o
12 | *.a
13 | *.so
14 |
15 | # Logs & temp
16 | *.log
17 | tmp/
18 | temp/
19 |
20 | # Directories to ignore at any depth
21 | **/.maestro
22 | **/.vscode
23 | **/.vscode-server
24 | **/.ssh
25 |
--------------------------------------------------------------------------------
/sandbox/configs/sandbox_bashrc:
--------------------------------------------------------------------------------
1 | # sandbox_bashrc
2 |
3 | # Guard against sourcing multiple times
4 | if [ -n "$SANDBOX_BASHRC_SOURCED" ]; then
5 | return
6 | fi
7 | export SANDBOX_BASHRC_SOURCED=1
8 |
9 | # If not running interactively, don't do anything
10 | [[ $- != *i* ]] && return
11 |
12 | # User-specific environment
13 | if ! [[ "$PATH" =~ "$HOME/.local/bin:$HOME/bin:" ]]
14 | then
15 | PATH="$HOME/.local/bin:$HOME/bin:$PATH"
16 | fi
17 | export PATH
18 |
19 | # User specific aliases and functions
20 | if [ -d ~/.bashrc.d ]; then
21 | for rc in ~/.bashrc.d/*; do
22 | if [ -f "$rc" ]; then
23 | . "$rc"
24 | fi
25 | done
26 | fi
27 |
28 | unset rc
29 |
--------------------------------------------------------------------------------
/scripts/install_swebench_harness.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Fail fast so a failed clone or cd does not cascade into later steps
3 | set -euo pipefail
4 | git clone https://github.com/swe-bench/SWE-bench
5 | cd SWE-bench
6 | pip install -e .
7 |
--------------------------------------------------------------------------------