├── .gitignore ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── base_agent ├── .gitignore ├── README.md ├── __main__.py ├── agent.py ├── agent_change_log.md ├── conftest.py ├── description.txt ├── pytest.ini ├── requirements.txt ├── src │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_calling.py │ │ ├── assistant_base_agent.py │ │ ├── base_agent.py │ │ └── implementations │ │ │ ├── __init__.py │ │ │ ├── archive_explorer.py │ │ │ ├── coder.py │ │ │ ├── main_orchestrator.py │ │ │ ├── problem_solver.py │ │ │ ├── reasoner.py │ │ │ └── review_committee_member.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── aime.py │ │ ├── aiq_benchmark.py │ │ ├── aiq_project_benchmarks.py │ │ ├── arc_agi.py │ │ ├── base.py │ │ ├── drop.py │ │ ├── file_editing.py │ │ ├── gpqa.py │ │ ├── gsm8k.py │ │ ├── gsm_ic.py │ │ ├── humaneval.py │ │ ├── livecodebench.py │ │ ├── math.py │ │ ├── refute.py │ │ ├── swebench_verified.py │ │ └── symbol_location.py │ ├── callgraph │ │ ├── __init__.py │ │ ├── digraph.py │ │ ├── manager.py │ │ └── reporting.py │ ├── config.py │ ├── events │ │ ├── __init__.py │ │ ├── event_bus.py │ │ └── event_bus_utils.py │ ├── llm │ │ ├── __init__.py │ │ ├── api.py │ │ ├── base.py │ │ ├── metering.py │ │ └── providers │ │ │ ├── __init__.py │ │ │ ├── anthropic.py │ │ │ ├── base_provider.py │ │ │ ├── deepseek.py │ │ │ ├── fireworks.py │ │ │ ├── google.py │ │ │ ├── google_oai.py │ │ │ ├── google_rest.py │ │ │ ├── openai.py │ │ │ └── vertex.py │ ├── oversight │ │ ├── graph_visualisation.py │ │ └── overseer.py │ ├── schemas │ │ ├── __init__.py │ │ ├── json_parsing.py │ │ ├── representation.py │ │ ├── xml_dumps.py │ │ └── xml_parsing.py │ ├── tools │ │ ├── __init__.py │ │ ├── answer_submission.py │ │ ├── archive_tools.py │ │ ├── base_agent_tools.py │ │ ├── base_tool.py │ │ ├── calculator.py │ │ ├── committee_design.py │ │ ├── directory_tools.py │ │ ├── edit_tools │ │ │ ├── __init__.py │ │ │ ├── overwrite_file.py │ │ │ └── utils.py │ │ ├── execute_command.py │ │ ├── file_tools.py │ │ ├── reasoning_structures │ │ │ ├── __init__.py │ │ │ ├── coding.py │ │ │ ├── meta_improvement.py │ │ │ ├── sequential.py │ │ │ └── sequential_subagents.py │ │ └── ripgrep_tool.py │ ├── types │ │ ├── __init__.py │ │ ├── agent_types.py │ │ ├── common.py │ │ ├── event_types.py │ │ ├── llm_types.py │ │ └── tool_types.py │ ├── utils │ │ ├── __init__.py │ │ ├── archive_analysis.py │ │ ├── documentation.py │ │ ├── file_views.py │ │ ├── metrics.py │ │ ├── parsing.py │ │ └── stop_tokens.py │ └── web_server │ │ ├── __init__.py │ │ ├── server.py │ │ ├── static │ │ ├── components │ │ │ ├── event-stream.js │ │ │ ├── execution-tree.js │ │ │ └── metrics-display.js │ │ ├── core.js │ │ ├── store.js │ │ ├── styles.css │ │ ├── utils │ │ │ ├── event-utils.js │ │ │ └── formatters.js │ │ └── visualizer.js │ │ └── templates │ │ └── index.html └── tests │ ├── agents │ └── test_agent_calling.py │ ├── benchmarks │ ├── disabled_test_livecode_benchmark.py │ ├── test_benchmark_base.py │ ├── test_file_editing.py │ ├── test_gsm8k_benchmark.py │ └── test_refute_benchmark.py │ ├── events │ └── test_event_bus.py │ ├── test_example.py │ ├── tools │ ├── reasoning_structures │ │ └── test_sequential.py │ ├── test_base_tool.py │ ├── test_calculator.py │ └── test_execute_command.py │ └── utils │ ├── test_archive_analysis.py │ └── test_parsing.py ├── benchmark_data └── .gitkeep ├── figures ├── agent_execution.png └── agent_loop.png ├── results └── interactive_output │ └── agent_outputs │ └── .gitkeep ├── runner.py ├── sandbox ├── 
Dockerfile ├── GOOGLE_APPLICATION_CREDENTIALS.json ├── base_requirements.txt └── configs │ ├── gitignore │ └── sandbox_bashrc └── scripts └── install_swebench_harness.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | **/__pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | ### VisualStudioCode 132 | .vscode/* 133 | !.vscode/settings.json 134 | !.vscode/tasks.json 135 | !.vscode/launch.json 136 | !.vscode/extensions.json 137 | *.code-workspace 138 | **/.vscode 139 | 140 | # JetBrains 141 | .idea/ 142 | 143 | # Data & Models 144 | *.h5 145 | *.tar 146 | *.tar.gz 147 | 148 | # Lightning-Hydra-Template 149 | configs/local/default.yaml 150 | /data/ 151 | /logs/ 152 | .env 153 | 154 | # Aim logging 155 | .aim 156 | 157 | # Custom files and directories 158 | third_party 159 | benchmark_data/aiq_bench 160 | benchmark_data/file_editing_bench 161 | benchmark_data/symbol_location_bench 162 | check_boilerplate.sh 163 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: Robeyns 5 | given-names: Maxime 6 | orcid: https://orcid.org/0000-0001-9802-9597 7 | - family-names: Szummer 8 | given-names: Martin 9 | - family-names: Aitchison 10 | given-names: Laurence 11 | title: "Self-Improving Coding Agent" 12 | version: 0.0.1 13 | date-released: 2025-04-12 14 | repository-code: "https://github.com/MaximeRobeyns/self_improving_coding_agent" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2025 Maxime Robeyns 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test 2 | 3 | PWD := $(shell pwd) 4 | 5 | int: ## Interactive run; uses default shell entrypoint 6 | @echo 'Once in the container, type:' 7 | @echo 'python -m agent_code.agent -s -p ""' 8 | @echo 'Watch the agent work on localhost:8080' 9 | docker run --rm -ti \ 10 | -p 8080:8080 \ 11 | -v ${PWD}/base_agent:/home/agent/agent_code:ro \ 12 | -v ${PWD}/results/interactive_output:/home/agent/workdir:rw \ 13 | sica_sandbox 14 | 15 | test: ## Run the unit tests for the agent 16 | @pytest base_agent 17 | 18 | image: ## Docker image for x86_64 19 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 20 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 21 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 22 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 23 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 24 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 25 | docker buildx build --build-context base_agent=./base_agent \ 26 | -f sandbox/Dockerfile \ 27 | -t sica_sandbox \ 28 | --build-arg TARGET_ARCH=x86_64 \ 29 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 30 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 31 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 32 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 33 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 34 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 35 | --load sandbox 36 | 37 | image-mac: ## Docker image for Apple silicon 38 | @ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 39 | OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 40 | FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 41 | GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 42 | DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 43 | VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 44 | docker buildx build --build-context base_agent=./base_agent \ 45 | -f sandbox/Dockerfile \ 46 | -t sica_sandbox \ 47 | --build-arg TARGET_ARCH=aarch64 \ 48 | --build-arg ANTHROPIC_API_KEY=$${ANTHROPIC_API_KEY:-placeholder_anthropic_api_key} \ 49 | --build-arg OPENAI_API_KEY=$${OPENAI_API_KEY:-placeholder_openai_api_key} \ 50 | --build-arg FIREWORKS_AI_API_KEY=$${FIREWORKS_AI_API_KEY:-placeholder_fireworks_api_key} \ 51 | --build-arg GEMINI_API_KEY=$${GEMINI_API_KEY:-placeholder_gemini_api_key} \ 52 | --build-arg DEEPSEEK_API_KEY=$${DEEPSEEK_API_KEY:-placeholder_deepseek_api_key} \ 53 | --build-arg VERTEX_PROJECT_ID=$${VERTEX_PROJECT_ID:-placeholder_vertex_project_id} \ 54 | --load sandbox 55 | 56 | docs: ## Compile documentation 57 | python base_agent/src/utils/documentation.py base_agent > base_agent/DOCUMENTATION.md 58 | 59 | meta: ## Run the meta-agent directly for testing (see manual request in __main__.py) 60 | rm -rf results/meta 61 | mkdir -p results/meta/test_logs 62 | cp -r base_agent results/meta/agent_iter 63 | # Copy an existing archive so that the meta agent has something to work with 64 | cp -r results/run_1
results/meta/archive 65 | @echo localhost:8080 66 | docker run --rm -ti \ 67 | -p 8080:8080 \ 68 | -v ${PWD}/base_agent:/home/agent/meta:ro \ 69 | -v ${PWD}/results/meta/archive:/home/agent/archive:ro \ 70 | -v ${PWD}/results/meta/agent_iter:/home/agent/workdir:rw \ 71 | -v ${PWD}/results/meta/test_logs:/home/agent/meta_logdir:rw \ 72 | sica_sandbox python -m meta improve \ 73 | --workdir /home/agent/workdir \ 74 | --logdir /home/agent/meta_logdir 75 | 76 | test_meta_int: ## Interactively test the resulting agent from the target above 77 | docker run --rm -ti \ 78 | -p 8080:8080 \ 79 | -p 8000:8000 \ 80 | -v ${PWD}/results/meta/agent_iter:/home/agent/agent_code:ro \ 81 | -v ${PWD}/results/meta/test_output:/home/agent/workdir:rw \ 82 | sica_sandbox 83 | 84 | 85 | help: 86 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | # Self-Improving Coding Agent 3 | 4 | A coding agent experiment that works on its own codebase. 5 | 6 | ![Agent Loop](figures/agent_loop.png) 7 |
8 | 9 | The system operates as an iterative improvement loop: 10 | 1. evaluating the current agent version on some benchmark tasks to capture how well it does 11 | 2. storing the results in an archive 12 | 3. running the agent on its own codebase to work on an improvement 13 | 4. going back to step 1 with the updated agent code 14 | 15 | See [our workshop paper](https://openreview.net/pdf?id=rShJCyLsOr) for more details. 16 | 17 | ## Quickstart 18 | 19 | > IMPORTANT NOTE: always run the agent in the provided Docker container. Since the agent can execute shell commands, this offers some isolation from your host machine, avoiding inadvertent file system manipulation and similar risks. 20 | 21 | First, make sure you've cloned the repo: 22 | ```bash 23 | git clone https://github.com/MaximeRobeyns/self_improving_coding_agent 24 | ``` 25 | 26 | Then, export some environment variables which will be made available in the 27 | Docker container. The project supports inference from a number of providers to 28 | allow for experimentation across many models. You must export at least one of 29 | these in your _local_ shell, which you can do either directly or with `direnv`, 30 | `dotenv`, etc. Omitting any provider key will simply make that provider's 31 | models unavailable to the agent. 32 | 33 | ```bash 34 | export ANTHROPIC_API_KEY= # For Claude models 35 | export OPENAI_API_KEY= # For GPT-4o and reasoning models (o1, o3, etc.) 36 | export GEMINI_API_KEY= # For Gemini models 37 | export VERTEX_PROJECT_ID= # For models hosted on GCP's Vertex 38 | export FIREWORKS_AI_API_KEY= # For DeepSeek / Llama hosted on Fireworks 39 | export DEEPSEEK_API_KEY= # For DeepSeek direct inference (V3, R1) 40 | export MODAL_TOKEN_ID= # To allow the agent to visit webpages and read papers 41 | export MODAL_TOKEN_SECRET= # To allow the agent to visit webpages and read papers 42 | ``` 43 | For Gemini, you should replace the template file in `sandbox/GOOGLE_APPLICATION_CREDENTIALS.json` with your own credentials. 44 | 45 | Once you have at least one LLM provider's API key exported, you can build the Docker image. The build command is wrapped in a Makefile target for convenience: 46 | 47 | ```bash 48 | make image 49 | ``` 50 | 51 | If you are using an Apple silicon machine, use this target instead: 52 | ``` 53 | make image-mac 54 | ``` 55 | 56 | Finally, install the requirements in your local Python environment: 57 | ```bash 58 | # remember to activate a virtual environment or equivalent here 59 | pip install -r base_agent/requirements.txt 60 | pip install swebench 61 | ``` 62 | 63 | ### Testing the Agent 64 | 65 | To check that the setup was successful, you can run the agent interactively with a manually set initial prompt using this target: 66 | ```bash 67 | make int 68 | ``` 69 | This will start the Docker container and attach your shell to it. You can then run 70 | ```bash 71 | python -m agent_code.agent --server true -p "" 72 | ``` 73 | Then open your browser at http://localhost:8080 to follow the agent execution. This will show you an interactive webpage which visualises the events in the event bus and the agent callgraph, allowing you to click on individual events to see them in more detail, read overseer messages, and collapse sub-agent traces. 74 | 75 | ![Agent Execution](figures/agent_execution.png) 76 | 77 | The agent's working directory is mapped to `results/interactive_output` and any files created will be available here on your machine. Agent logs will be in `results/interactive_output/agent_outputs`.
78 | 79 | You can see more options by doing 80 | ```bash 81 | make help 82 | ``` 83 | or see the agent's arguments with 84 | ```bash 85 | python -m base_agent.agent --help 86 | ``` 87 | 88 | To further configure the agent, including the choice of LLMs, edit `base_agent/src/config.py`. 89 | 90 | ## Self-Improvement Loop 91 | 92 | To run the self-improvement loop, first inspect the list of benchmarks in the `base_agent/src/benchmarks/__init__.py` file, and make sure that you have uncommented those you want to include. Then do 93 | ```bash 94 | python runner.py 95 | ``` 96 | To see all the options, do 97 | ```bash 98 | python runner.py --help 99 | ``` 100 | Common options might be 101 | ```bash 102 | python runner.py --id 1 --workers 6 103 | ``` 104 | 105 | This will start the agent loop, placing the results in `results/run_{id}`. 106 | 107 | ## Things to work on 108 | 109 | Here are some potential things to try with the agent framework: 110 | 111 | - [ ] get the agent to curate / build more of its own benchmarks 112 | - [ ] reduce the variance of self-improvement runs (early features often influence subsequent features) 113 | - [ ] use a stronger LLM to build a scaffold for a weaker LLM 114 | - [ ] find or create more realistic 'software engineering' benchmark tasks 115 | 116 | ## Agent Description 117 | 118 | The agent in `base_agent` is a minimal agent that can just about perform the 119 | meta-improvement task. It lacks efficient file-editing tools, devtools such as 120 | tree-sitter or LSP integrations, and advanced reasoning structures that would 121 | help it when performing coding tasks. It has the necessary building blocks 122 | to bootstrap these features and specialise itself to the distribution of 123 | benchmark tasks included. 124 | 125 | Please see `base_agent/README.md` for a more detailed discussion of the base agent framework.
126 | 127 | ``` 128 | ├── base_agent 129 | │   ├── agent_change_log.md 130 | │   ├── agent.py 131 | │   ├── conftest.py 132 | │   ├── description.txt 133 | │   ├── __main__.py 134 | │   ├── pytest.ini 135 | │   ├── README.md 136 | │   ├── requirements.txt 137 | │   ├── src 138 | │   │   ├── agents 139 | │   │   ├── benchmarks 140 | │   │   ├── callgraph 141 | │   │   ├── config.py 142 | │   │   ├── events 143 | │   │   ├── __init__.py 144 | │   │   ├── llm 145 | │   │   ├── oversight 146 | │   │   ├── schemas 147 | │   │   ├── tools 148 | │   │   ├── types 149 | │   │   ├── utils 150 | │   │   └── web_server 151 | │   └── tests 152 | │       ├── agents 153 | │       ├── benchmarks 154 | │       ├── events 155 | │       ├── __pycache__ 156 | │       ├── test_example.py 157 | │       ├── tools 158 | │       └── utils 159 | ├── benchmark_data 160 | ├── results 161 | │   ├── run_{id} 162 | │   └── interactive_output 163 | ├── runner.py 164 | └── sandbox 165 | ``` 166 | 167 | ### Results Organization 168 | 169 | ``` 170 | results/run_{id}/ 171 | ├── metadata.json # Experiment metadata 172 | └── agent_{i}/ # Agent iteration directory 173 | ├── agent_code/ # Agent implementation 174 | ├── benchmarks/ # Benchmark results 175 | │ └── {bench_name}/ 176 | │ ├── results.jsonl # Per-problem results 177 | │ ├── perf.jsonl # Summary metrics 178 | │ └── traces/ # Detailed traces 179 | └── meta_improvement/ # Improvement logs 180 | ``` 181 | 182 | ## Citation 183 | 184 | ``` 185 | @inproceedings{ 186 | robeyns2025sica, 187 | title={{SICA}: A Self-Improving Coding Agent}, 188 | author={Maxime Robeyns and Martin Szummer and Laurence Aitchison}, 189 | booktitle={ICLR 2025 Workshop on Scaling Self-Improving Foundation Models}, 190 | year={2025}, 191 | url={https://openreview.net/forum?id=rShJCyLsOr} 192 | } 193 | ``` 194 | -------------------------------------------------------------------------------- /base_agent/.gitignore: -------------------------------------------------------------------------------- 1 | ENV_VARS 2 | -------------------------------------------------------------------------------- /base_agent/agent_change_log.md: -------------------------------------------------------------------------------- 1 | # Agent Codebase Change Log 2 | 3 | | Iteration | Change Name | Was Successful? (pending/yes/no) | 4 | |-----------|-------------|----------------------------------| 5 | | 0 | Base Agent | yes | 6 | 7 | 8 | ## Iteration 0: Base Agent 9 | 10 | This is a template iteration which you should follow for subsequent iterations. 11 | 12 | ### Feature Description 13 | 14 | This is to be written at iteration i (in this case, i=0). Describe the intention / motivation / hypothesis behind the change made. 15 | 16 | ### Feature Outcome 17 | 18 | This part is supposed to be written at iteration i + 1 (and potentially updated at subsequent iterations), and comments on the empirical effectiveness of the change. 19 | 20 | ## Iteration 1: 21 | -------------------------------------------------------------------------------- /base_agent/conftest.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree.
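# Usage sketch for the options defined below: tests marked `uses_llm` or
# `slow` (markers declared in pytest.ini) are skipped by default, and only
# run when the matching flag is passed, e.g.:
#
#   pytest base_agent --run-llm --run-slow
#
# (this mirrors the Makefile's `test` target, which wraps `pytest base_agent`)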
6 | import pytest 7 | 8 | # Enable asyncio support for pytest 9 | pytest_plugins = ["pytest_asyncio"] 10 | 11 | # Optional: Define custom command-line options for your markers 12 | def pytest_addoption(parser): 13 | parser.addoption( 14 | "--run-llm", 15 | action="store_true", 16 | default=False, 17 | help="Run tests marked with 'uses_llm'", 18 | ) 19 | parser.addoption( 20 | "--run-slow", 21 | action="store_true", 22 | default=False, 23 | help="Run tests marked with 'slow'", 24 | ) 25 | 26 | # Skip tests based on markers unless the corresponding option is provided 27 | def pytest_collection_modifyitems(config, items): 28 | if not config.getoption("--run-llm"): 29 | skip_llm = pytest.mark.skip(reason="need --run-llm option to run") 30 | for item in items: 31 | if "uses_llm" in item.keywords: 32 | item.add_marker(skip_llm) 33 | if not config.getoption("--run-slow"): 34 | skip_slow = pytest.mark.skip(reason="need --run-slow option to run") 35 | for item in items: 36 | if "slow" in item.keywords: 37 | item.add_marker(skip_slow) 38 | -------------------------------------------------------------------------------- /base_agent/description.txt: -------------------------------------------------------------------------------- 1 | This is the base, v0 agent that is used as a starting point. 2 | -------------------------------------------------------------------------------- /base_agent/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = . 3 | # --ff for previously failed first 4 | # -l for print state on failure 5 | # -x for stop on first failure 6 | # -s for show stdout while testing 7 | # -v for verbose (e.g. show test names) 8 | # -n for n threadsafe parallel workers 9 | addopts = -l -x --ff -s -v 10 | testpaths = tests 11 | filterwarnings = ignore::DeprecationWarning 12 | asyncio_default_fixture_loop_scope = function 13 | markers = 14 | uses_llm: marks tests as using llms (run with '--run-llm') 15 | asyncio: marks tests as asynchronous 16 | integration: marks tests as integration tests 17 | slow: marks tests that run slowly 18 | performance: marks tests that benchmark performance (run with '-m performance') 19 | -------------------------------------------------------------------------------- /base_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines 2 | cryptography 3 | datasets 4 | tiktoken 5 | pydantic[email] 6 | pydantic-settings 7 | python-dotenv 8 | anthropic[vertex]==0.42.0 9 | tabulate 10 | openai 11 | json-repair 12 | rich 13 | jinja2 14 | fastapi 15 | uvicorn[standard] 16 | GitPython 17 | diff-match-patch 18 | swebench 19 | duckduckgo-search 20 | scipy 21 | sympy 22 | google-genai 23 | googlesearch-python 24 | pytest 25 | pytest-asyncio 26 | google-cloud-aiplatform 27 | -------------------------------------------------------------------------------- /base_agent/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/__init__.py -------------------------------------------------------------------------------- /base_agent/src/agents/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # 
LICENSE file in the root directory of this source tree. 6 | """ 7 | The agents module defines the agents that can be composed and called to 8 | construct the broader scaffolding system. An agent might be thought of as a 9 | function in a program in the sense that they can be invoked, invoke other 10 | agents themselves, be composed and so forth - indeed, we maintain a 'callgraph' 11 | of the agent calls in the system. 12 | 13 | Individually, an "agent" is just a class that is used to carefully compose an 14 | LLM's context. The way the LLM sees the context is as follows: 15 | 16 | - a system prompt section, in which the "agent's" definition, goals, and 17 | available tools and sub-agents are defined 18 | - the first "user" message, referred to as the core prompt section, which is 19 | defined by the agent itself and which pertains to the way in which the agent 20 | should go about its execution; what sequence of steps it should follow, what 21 | it should focus on, what outcomes it should try to achieve. This is also 22 | where we put visualisations of system state such as file trees and file 23 | viewers. 24 | - the "assistant" message, which contains the agent's response and consists of 25 | alternating sequences of thought and tool or sub-agent calls. The 26 | 'function calling interface' for tools and sub-agents is very similar, 27 | consisting of an XML sequence whose last closing tag is a stop token. After 28 | this has been generated, the LLM will stop, and the contents of the XML will be 29 | parsed to identify the tool or sub-agent name and the arguments provided; these 30 | will be validated, the tool or sub-agent will be run, and the response will 31 | be serialised. These will then be concatenated to the previously generated 32 | assistant message, and the LLM will be called again with this as the 33 | assistant "pre-fill". 34 | 35 | Note, the way this is implemented, and the programming model to maintain, is 36 | that each agent maintains an 'event stream', published to the event bus. This 37 | is a list of events (such as new assistant messages, tool calls and results, 38 | agent calls and results, file events, overseer notifications and so forth) 39 | which describes the execution of the agent. The assistant message is 40 | reconstructed by filtering this event stream and concatenating the values. At a 41 | basic level, just the assistant messages and tool / agent results can be 42 | concatenated, although other event types can be included. For instance, the 43 | file open event may also be included here (with a view of the file content) in 44 | order to save re-generating the core prompt, which would cause a KV cache miss. 45 | By only appending to the LLM agent's context, we can avoid breaking the 46 | cache, at the cost of lengthening it and potentially duplicating content - 47 | eventually it becomes more cost-effective to consolidate all this file state 48 | into the core prompt, shortening the prompt and re-calculating the KV cache. 49 | 50 | Also note that overseer notification events are handled slightly differently. 51 | When reconstructing the event stream, we stop the current assistant message, 52 | add the overseer notification in a new 'user' message, before continuing with 53 | the rest of the events in a new assistant pre-fill message.
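
As a purely illustrative sketch of this function-calling interface (the
concrete tag names and argument schemas are defined by the schemas and tools
modules, so treat the names below as hypothetical rather than the exact wire
format), a tool invocation generated within the assistant message might look
like:

    <TOOL_CALL>
    <TOOL_NAME>calculator</TOOL_NAME>
    <EXPRESSION>3 * (4 + 5)</EXPRESSION>
    </TOOL_CALL>

where the final closing tag doubles as the stop token: generation halts
there, the block is parsed and validated, the tool is run, and its serialised
result is appended to the assistant pre-fill before the LLM is called again.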
54 | """ 55 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Built-in agent agents providing core capabilities.""" 7 | 8 | from ..base_agent import BaseAgent, AgentResult 9 | 10 | 11 | class DemoAgent(BaseAgent): 12 | """Agent for constructing examples in tools""" 13 | 14 | AGENT_NAME = "demo_agent" 15 | AGENT_DESCRIPTION = "a dummy agent for demonstration" 16 | SYSTEM_PROMPT = "" 17 | 18 | async def construct_core_prompt(self) -> str: 19 | return "" 20 | 21 | @classmethod 22 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]: 23 | return [] 24 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/problem_solver.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from typing import List 7 | from pathlib import Path 8 | from pydantic import Field 9 | 10 | 11 | from .reasoner import ReasoningAgent 12 | from ..base_agent import BaseAgent 13 | from ...config import settings 14 | from ...tools.calculator import Calculator 15 | from ...tools.directory_tools import ViewDirectory 16 | from ...tools.execute_command import ExecuteCommand 17 | from ...tools.file_tools import OpenFile, CloseFile 18 | from ...tools.edit_tools import OverwriteFile 19 | from ...tools.ripgrep_tool import RipGrepTool 20 | from ...tools.committee_design import ReviewCommittee 21 | from .coder import CodingAgent 22 | from ...utils.metrics import make_random_agent_metrics 23 | from ...types.agent_types import AgentStatus, AgentResult 24 | from ...events.event_bus_utils import get_problem_statement 25 | 26 | 27 | class ProblemSolvingAgent(BaseAgent): 28 | """ 29 | A multi-purpose problem-solving agent with access to all tools and capabilities. 30 | 31 | This agent can: 32 | 1. Analyze and decompose complex problems 33 | 2. Plan and execute solutions systematically 34 | 3. Use a wide range of tools and agents 35 | 4. Validate and refine solutions 36 | 5. Handle errors and edge cases 37 | 6. Document and explain its process 38 | """ 39 | 40 | AGENT_NAME = "general_problem_solver" 41 | 42 | AGENT_DESCRIPTION = """ 43 | Your default agent for all tasks. Highly versatile, with broad tool access. Best for tasks requiring multiple capabilities or when specific agent choice isn't obvious. 44 | 45 | Note that the agent will not have the context that you have / be able to see the initial problem statement verbatim. It is up to you to accurately relay this to the sub-agent, or decompose it into sub-tasks if it is very long and repeating it verbatim would be slow and costly. 
46 | 47 | Example capabilities 48 | - Problem decomposition and analysis 49 | - General purpose writing tasks 50 | - Basic coding (although not specialised) 51 | - Quick system and file operations 52 | - Mathematical computation 53 | - Running shell commands 54 | 55 | Choose when: 56 | - Specific agent isn't clearly better 57 | - Need flexible approach 58 | 59 | Avoid when: 60 | - Task fits squarely in another agent's specialty 61 | - Requires deep domain expertise""" 62 | 63 | SYSTEM_PROMPT = """You are a very competent problem solver who finds solutions swiftly and effectively. 64 | 65 | You should 66 | 1. Understand the sense of the problem you have been given 67 | 2. Identify the optimal tools and methods you can use to solve your task 68 | 3. Swiftly execute on the problem 69 | 4. Continuously validate and check your work 70 | 71 | Aim for simple, elegant and correct solutions. 72 | """ 73 | 74 | # Available tools - complete access to all tools 75 | # NOTE: ExitAgent and ReturnResult are automatically included 76 | AVAILABLE_TOOLS = { 77 | Calculator, 78 | ViewDirectory, 79 | ExecuteCommand, 80 | OpenFile, 81 | CloseFile, 82 | OverwriteFile, 83 | RipGrepTool, 84 | ReviewCommittee, 85 | } 86 | 87 | # Available agents 88 | # AVAILABLE_AGENTS = set() 89 | AVAILABLE_AGENTS = {ReasoningAgent, CodingAgent} 90 | 91 | HAS_FILEVIEW = True 92 | 93 | MODEL = settings.MODEL 94 | TEMPERATURE = 0.666 95 | 96 | # Agent parameters 97 | problem_statement: str = Field( 98 | ..., 99 | description="The problem or request you want the problem solver agent to solve", 100 | ) 101 | previous_agent_runs: List[str] = Field( 102 | default=[], 103 | description="A list of descriptions of previous work undertaken by other agents, context which this agent would benefit from knowing. This helps to avoid duplicate work.", 104 | ) 105 | requirements: List[str] = Field( 106 | default=[], 107 | description="A list of very specific and low-level criteria which must be met or become valid for the sub-agent to consider its work done.", 108 | ) 109 | 110 | def __init__( 111 | self, 112 | parent: BaseAgent | None = None, 113 | workdir: Path | None = None, 114 | logdir: Path | None = None, 115 | debug_mode: bool = False, 116 | **data, 117 | ): 118 | super().__init__( 119 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data 120 | ) 121 | 122 | async def construct_core_prompt(self) -> str: 123 | """Construct the core prompt for problem solving.""" 124 | 125 | # initial_request = await get_problem_statement() 126 | # if initial_request is None or initial_request == "": 127 | # raise ValueError( 128 | # "The initial request was not provided to the problem solver" 129 | # ) 130 | 131 | prompt = f"""Here is the problem you have been asked to solve: 132 | 133 | 134 | {self.problem_statement} 135 | 136 | """ 137 | 138 | if self.previous_agent_runs: 139 | prompt += "\n\nWork Previously Completed:" 140 | prompt += "\nYou should pay attention to this list to avoid duplicating work. Also note that this list is for work completed by other agents, which aren't 100% reliable, so treat claims with appropriate caution, and verify accordingly." 141 | for work in self.previous_agent_runs: 142 | prompt += f"\n- {work}" 143 | 144 | if self.requirements: 145 | prompt += "\n\nSpecific requirements which must be met before you can consider the work 'done':" 146 | for req in self.requirements: 147 | prompt += f"\n- {req}" 148 | 149 | prompt += "\n\nReturn your answer when complete."
150 | 151 | return prompt 152 | 153 | @classmethod 154 | def generate_examples(cls) -> list[tuple["BaseAgent", AgentResult]]: 155 | """Generate example uses of the tool with their expected outputs.""" 156 | examples = [ 157 | # Example 1: Mathematical Problem Solving 158 | ( 159 | cls( 160 | problem_statement="""Solve the following system of equations: 161 | 3x + 2y = 12 162 | x - y = 1""", 163 | requirements=[ 164 | "Show the full answer derivation", 165 | "Verify the solution numerically using Python", 166 | ], 167 | ), 168 | AgentResult( 169 | agent_name=cls.AGENT_NAME, 170 | status=AgentStatus.SUCCESS, 171 | result="""Solution found: x = 4, y = 3 172 | 173 | Process: 174 | 1. Used elimination method 175 | 2. Verified by substitution in a Python script 176 | 3. Checked both equations 177 | 4. All validation criteria met""", 178 | metrics=make_random_agent_metrics( 179 | tools_enabled=True, agents_enabled=True 180 | ), 181 | ), 182 | ), 183 | # Example 2: Code Analysis and Modification 184 | # ( 185 | # cls( 186 | # problem_statement="""Fix the performance issue in process_data() agent: 187 | # 188 | # - Current implementation uses O(n²) time 189 | # - Need to optimize to O(n) complexity 190 | # - Maintain existing API contract""", 191 | # requirements=[ 192 | # "Keep existing agent signature", 193 | # "Maintain thread safety", 194 | # "Add performance tests", 195 | # ], 196 | # ), 197 | # AgentResult( 198 | # agent_name=cls.AGENT_NAME, 199 | # status=AgentStatus.SUCCESS, 200 | # result="""Optimized process_data() agent: 201 | # 202 | # 1. Analyzed existing implementation 203 | # 2. Identified quadratic loop pattern 204 | # 3. Refactored to use hash table 205 | # 4. Added performance tests 206 | # 5. Verified thread safety 207 | # 6. Maintained API compatibility 208 | # 209 | # Performance improved: 210 | # - Before: O(n²) time, O(1) space 211 | # - After: O(n) time, O(n) space 212 | # - Verified with test suite""", 213 | # metrics=make_random_agent_metrics( 214 | # tools_enabled=True, agents_enabled=True 215 | # ), 216 | # ), 217 | # ), 218 | ] 219 | return examples 220 | -------------------------------------------------------------------------------- /base_agent/src/agents/implementations/review_committee_member.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from pathlib import Path 7 | from pydantic import Field 8 | 9 | 10 | from ..base_agent import BaseAgent 11 | from ...config import settings 12 | from ...tools.directory_tools import ViewDirectory 13 | from ...tools.file_tools import OpenFile, CloseFile 14 | from ...tools.ripgrep_tool import RipGrepTool 15 | from ...utils.metrics import make_random_agent_metrics 16 | from ...types.agent_types import AgentStatus, AgentResult 17 | from ...types.llm_types import Model 18 | from .reasoner import ReasoningAgent 19 | 20 | 21 | class CommitteeMember(BaseAgent): 22 | """ 23 | A simple review committee agent, with read-only access to the project. 24 | """ 25 | 26 | AGENT_NAME = "meta_agent_design_reviewer" 27 | 28 | AGENT_DESCRIPTION = """A meta-agent design review committee member. 
Called from the committee_design tool.""" 29 | 30 | SYSTEM_PROMPT = """You are a member of a Meta-Agent design review committee, tasked with evaluating a coding agent's design proposal about how to improve a coding agent system, before it begins work on the implementation. Your role is to provide a detailed, constructive and reasonable critique that ensures the proposed design avoids commonly identified pathologies in coding agent design, and is robust, practical, and aligned with the goals of the self-improving coding agent. 31 | 32 | Approach the review with a critical yet collaborative mindset, drawing on established engineering principles such as simplicity (delete unnecessary parts), conceptual integrity (a cohesive whole), and testability. 33 | 34 | You must ensure that the design is grounded in making the coding agent system better at writing software, advocating for things like 35 | - improving the mechanics of writing the code files: more efficient file editing strategies and tools 36 | - building reasoning and organisational structures which guide the agent to generate better code 37 | - things which improve the speed with which the agent is able to complete code tasks 38 | - features which improve the quality of the written code: such as improving the generated code's formatting and structure, utilities for robust and efficient testing, or enhancements to the maintainability of the code 39 | 40 | Focus on the following desiderata: 41 | - Clarity: Is the proposal understandable and well-articulated? 42 | - Feasibility: Can it be realistically implemented given constraints? 43 | - Robustness: Does it handle real-world challenges (e.g., edge cases, failures)? 44 | - Quality: Does it reflect good design and testing practices for long-term value? 45 | - Grounding: Is it supported by executable feedback (e.g., tests) to verify its claims? 46 | 47 | Provide a structured evaluation: identify strengths, flag weaknesses, and suggest actionable improvements. Avoid vague or frivolous feedback; every critique should tie back to the project's success. Your specialized role will guide your focus, but always consider the proposal as a whole.""" 48 | 49 | # Available tools 50 | # NOTE: ExitAgent and ReturnResult are automatically included 51 | # We limit ourselves to 'read only' tools.
52 | AVAILABLE_TOOLS = { 53 | ViewDirectory, 54 | OpenFile, 55 | CloseFile, 56 | RipGrepTool, 57 | } 58 | 59 | # Available agents 60 | # AVAILABLE_AGENTS = {ReasoningAgent} 61 | AVAILABLE_AGENTS = set() 62 | 63 | HAS_FILEVIEW = True 64 | 65 | MODEL = settings.MODEL 66 | TEMPERATURE = 0.666 67 | 68 | # Agent parameters 69 | proposal: str = Field( 70 | ..., 71 | description="The full proposal to review", 72 | ) 73 | context: str = Field( 74 | ..., 75 | description="The motivation and context for understanding the plan", 76 | ) 77 | specialisation: str = Field( 78 | ..., description="The specialisation of this committee member" 79 | ) 80 | model: Model = Field(default=Model.SONNET_35) 81 | 82 | def __init__( 83 | self, 84 | parent: BaseAgent | None = None, 85 | workdir: Path | None = None, 86 | logdir: Path | None = None, 87 | debug_mode: bool = False, 88 | **data, 89 | ): 90 | super().__init__( 91 | parent=parent, workdir=workdir, logdir=logdir, debug_mode=debug_mode, **data 92 | ) 93 | 94 | async def construct_core_prompt(self) -> str: 95 | """Construct the core prompt for the committee member.""" 96 | 97 | prompt = f"""{self.specialisation} 98 | 99 | Here is the agent's self-provided goals and context surrounding the plan 100 | 101 | {self.context} 102 | 103 | 104 | Here is the design proposal you have been asked to review: 105 | 106 | 107 | {self.proposal} 108 | 109 | 110 | You should read the README.md file first to get the full context of this self-improving coding agent project. 111 | You should then view the agent_change_log.md to get an idea of what (if anything) has already been tried by the coding agent as it attempts to improve itself, as measured by the benchmark performance. 112 | You can also quickly view any other code files that you need to get context on the proposal. 113 | 114 | Then, craft your review. Don't spend too long opening other files and doing research. Move swiftly. Note that you MUST provide your full review in the return_result tool since this is how it is communicated back. Anything not put in the return_result tool will not be seen by the agent. 115 | 116 | DO NOT attempt the task yourself, and avoid calling tools unless you absolutely need to. Then, simply provide your review in the return_result tool and complete. 117 | """ 118 | 119 | return prompt 120 | 121 | @classmethod 122 | def generate_examples(cls) -> list[tuple["CommitteeMember", AgentResult]]: 123 | """Generate example uses of the tool with their expected outputs. 124 | 125 | Note that the committee member is deterministically invoked (for now) 126 | so these examples won't be used. 127 | """ 128 | examples = [] 129 | return examples 130 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
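# To include a benchmark in the self-improvement loop, uncomment its entry in
# the `benchmark_registry` mapping below (this is the workflow described in
# the "Self-Improvement Loop" section of the top-level README).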
6 | from typing import Type 7 | from collections import OrderedDict 8 | 9 | from .base import BaseBenchmark 10 | from .gpqa import GPQABenchmark 11 | from .aime import AIMEBenchmark 12 | from .drop import DROPBenchmark 13 | from .math import MATHBenchmark 14 | from .gsm8k import GSM8KBenchmark 15 | from .gsm_ic import GSMICBenchmark 16 | from .refute import RefuteBenchmark 17 | from .arc_agi import ARCAGIBenchmark 18 | from .humaneval import HumanEvalBenchmark 19 | from .file_editing import FileEditingBenchmark 20 | from .aiq_benchmark import AIQBenchmark 21 | from .livecodebench import LiveCodeBenchmark 22 | from .symbol_location import SymbolLocationBenchmark 23 | from .swebench_verified import SWEBenchBenchmark 24 | from .aiq_project_benchmarks import ( 25 | LinalgAIQBenchmark, 26 | CSVParsingAIQBenchmark, 27 | MessagingAppAIQBenchmark, 28 | DistKVStoreAIQBenchmark, 29 | ) 30 | 31 | # Important, append new benchmarks to the end of this 32 | benchmark_registry: OrderedDict[str, Type[BaseBenchmark]] = OrderedDict( 33 | [ 34 | (GSM8KBenchmark.name, GSM8KBenchmark), 35 | # (DROPBenchmark.name, DROPBenchmark), 36 | # (ARCAGIBenchmark.name, ARCAGIBenchmark), 37 | # (MATHBenchmark.name, MATHBenchmark), 38 | # (GSMICBenchmark.name, GSMICBenchmark), 39 | # (FileEditingBenchmark.name, FileEditingBenchmark), 40 | # (SWEBenchBenchmark.name, SWEBenchBenchmark), 41 | # (HumanEvalBenchmark.name, HumanEvalBenchmark), 42 | # (AIMEBenchmark.name, AIMEBenchmark), 43 | # (GPQABenchmark.name, GPQABenchmark), 44 | # (LiveCodeBenchmark.name, LiveCodeBenchmark), 45 | # (SymbolLocationBenchmark.name, SymbolLocationBenchmark), 46 | # (RefuteBenchmark.name, RefuteBenchmark), 47 | # (AIQBenchmark.name, AIQBenchmark), 48 | # (LinalgAIQBenchmark.name, LinalgAIQBenchmark), 49 | # (CSVParsingAIQBenchmark.name, CSVParsingAIQBenchmark), 50 | # (MessagingAppAIQBenchmark.name, MessagingAppAIQBenchmark), 51 | # (DistKVStoreAIQBenchmark.name, DistKVStoreAIQBenchmark), 52 | ] 53 | ) 54 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/aime.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import random 7 | import logging 8 | 9 | from pathlib import Path 10 | from datasets import load_dataset 11 | from dataclasses import dataclass 12 | 13 | from .base import BaseBenchmark, Problem 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | @dataclass 20 | class AIMEExample: 21 | """A single AIME example.""" 22 | problem_id: str # e.g., "2024-I-1" 23 | problem: str 24 | solution: str 25 | answer: int 26 | 27 | @classmethod 28 | def from_raw(cls, example: dict) -> "AIMEExample": 29 | """Create an AIMEExample from a raw dataset example.""" 30 | return cls( 31 | problem_id=str(example["ID"]), 32 | problem=example["Problem"].strip(), 33 | solution=example["Solution"].strip(), 34 | answer=int(example["Answer"]) # AIME answers are always integers 35 | ) 36 | 37 | 38 | class AIMEBenchmark(BaseBenchmark): 39 | """Benchmark for the American Invitational Mathematics Examination (AIME) 2024 dataset. 40 | 41 | The AIME is a prestigious high school mathematics competition known for its challenging 42 | mathematical problems. All answers in AIME are integers. 
43 | """ 44 | 45 | name = "aime" 46 | 47 | def __init__(self, seed: int | None = 1, subset_size: int | None = 20): 48 | super().__init__(seed, subset_size) 49 | 50 | # Load dataset from HuggingFace 51 | dataset = load_dataset("Maxwell-Jia/AIME_2024") 52 | self.test_data = [AIMEExample.from_raw(ex) for ex in dataset["train"]] # Dataset only has train split 53 | 54 | # Create randomized subset if requested 55 | if subset_size is not None: 56 | random.seed(seed) 57 | self.test_data = random.sample(self.test_data, subset_size) 58 | 59 | # Convert to Problem instances 60 | self._data = [ 61 | Problem( 62 | problem_id=ex.problem_id, 63 | statement=ex.problem, 64 | answer=ex.answer, 65 | answer_discussion=ex.solution, 66 | ) 67 | for ex in self.test_data 68 | ] 69 | 70 | @property 71 | def problems(self) -> list[Problem]: 72 | return self._data 73 | 74 | async def score_problem( 75 | self, 76 | problem: Problem, 77 | agent_workdir: str, 78 | agent_answer_dir: str, 79 | container_name: str, 80 | ) -> tuple[float, str | None, str | None]: 81 | """Score the answer to the problem. 82 | 83 | Since AIME answers are always integers, we can do exact matching without 84 | any floating-point comparison. 85 | 86 | Returns: 87 | tuple of: 88 | - score (0.0 or 1.0) 89 | - error message (if any) 90 | - solution discussion 91 | """ 92 | try: 93 | answer_path = Path(agent_answer_dir) / "answer.txt" 94 | llm_answer = answer_path.read_text().strip() 95 | 96 | # Clean the answer by removing any commas and whitespace 97 | llm_answer = llm_answer.replace(",", "").replace(" ", "") 98 | 99 | # Convert to integer and compare exactly 100 | try: 101 | answer_int = int(llm_answer) 102 | if answer_int == problem.answer: 103 | return 1.0, None, problem.answer_discussion 104 | return 0.0, None, problem.answer_discussion 105 | except ValueError: 106 | return 0.0, "Answer must be an integer", problem.answer_discussion 107 | 108 | except Exception as e: 109 | logger.debug(f"Error in AIME scoring: {e}") 110 | return 0.0, str(e), problem.answer_discussion 111 | 112 | 113 | if __name__ == "__main__": 114 | import tempfile 115 | 116 | def run_test_case(benchmark: AIMEBenchmark, answer_dir: Path, 117 | ground_truth: int, agent_answer: str, should_pass: bool): 118 | """Helper function to run a single test case""" 119 | print(f"\nTESTING: '{ground_truth}' vs '{agent_answer}' (should_pass={should_pass})") 120 | 121 | # Use first problem as template but override answer 122 | problem = benchmark.problems[0] 123 | problem.answer = ground_truth 124 | problem.answer_discussion = "Test discussion" 125 | 126 | answer_file = answer_dir / "answer.txt" 127 | answer_file.write_text(agent_answer) 128 | 129 | score, error, _ = benchmark.score_problem( 130 | problem, str(answer_dir.parent), str(answer_dir), "test" 131 | ) 132 | 133 | assert score == (1.0 if should_pass else 0.0), \ 134 | f"Failed: '{ground_truth}' vs '{agent_answer}' got {score}, expected {1.0 if should_pass else 0.0}" 135 | if error: 136 | print(f"Error message: {error}") 137 | 138 | # Create test environment 139 | benchmark = AIMEBenchmark() 140 | 141 | with tempfile.TemporaryDirectory() as tmpdir: 142 | answer_dir = Path(tmpdir) / "answers" 143 | answer_dir.mkdir() 144 | 145 | print("\nTesting basic integer answers...") 146 | test_cases = [ 147 | (42, "42", True), 148 | (42, "42.0", False), # Must be exact integer 149 | (1000, "1,000", True), # Allow commas 150 | (1000, "1000", True), 151 | (1000, " 1000 ", True), # Allow whitespace 152 | (42, "abc", False), # Non-numeric 153 | 
(-123, "-123", True), # Negative numbers 154 | (0, "0", True), 155 | (0, "0.0", False), 156 | (42, "41", False), # Wrong answer 157 | ] 158 | for truth, pred, should_pass in test_cases: 159 | run_test_case(benchmark, answer_dir, truth, pred, should_pass) 160 | 161 | # Test that the dataset loads correctly 162 | print("\nTesting dataset loading...") 163 | assert len(benchmark.problems) > 0, "Dataset should not be empty" 164 | assert all(isinstance(p.answer, int) for p in benchmark.problems), \ 165 | "All answers should be integers" 166 | assert all(isinstance(p.problem_id, str) for p in benchmark.problems), \ 167 | "All problem IDs should be strings" 168 | assert all(p.problem_id.startswith("2024-") for p in benchmark.problems), \ 169 | "All problem IDs should start with 2024-" 170 | 171 | # Test subset functionality 172 | print("\nTesting subset functionality...") 173 | subset_size = 5 174 | benchmark_subset = AIMEBenchmark(seed=42, subset_size=subset_size) 175 | assert len(benchmark_subset.problems) == subset_size, \ 176 | f"Subset size should be {subset_size}, got {len(benchmark_subset.problems)}" 177 | 178 | # Test seed reproducibility 179 | print("\nTesting seed reproducibility...") 180 | benchmark_subset1 = AIMEBenchmark(seed=42, subset_size=subset_size) 181 | benchmark_subset2 = AIMEBenchmark(seed=42, subset_size=subset_size) 182 | assert [p.problem_id for p in benchmark_subset1.problems] == \ 183 | [p.problem_id for p in benchmark_subset2.problems], \ 184 | "Same seed should produce same subset" 185 | 186 | # Test different seeds produce different subsets 187 | benchmark_subset3 = AIMEBenchmark(seed=43, subset_size=subset_size) 188 | assert [p.problem_id for p in benchmark_subset1.problems] != \ 189 | [p.problem_id for p in benchmark_subset3.problems], \ 190 | "Different seeds should produce different subsets" 191 | 192 | print("\nAll tests passed! ✨") 193 | -------------------------------------------------------------------------------- /base_agent/src/benchmarks/base.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
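# This module defines the benchmark interface used across the package: a
# concrete benchmark subclasses `BaseBenchmark`, sets the `name` class
# variable, exposes its `Problem` list via the `problems` property, and
# implements the async `score_problem` hook (see e.g. gsm8k.py or aime.py in
# this package for worked examples).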
6 | import jsonlines 7 | 8 | from abc import abstractmethod 9 | from typing import Any, ClassVar 10 | from pathlib import Path 11 | from datetime import datetime 12 | from dataclasses import dataclass, asdict 13 | 14 | 15 | @dataclass 16 | class Problem: 17 | """A single benchmark problem, containing a problem_id, problem statement and answer""" 18 | 19 | problem_id: str 20 | statement: str 21 | answer: Any 22 | answer_discussion: str | None 23 | 24 | 25 | @dataclass 26 | class ProblemResult: 27 | """Complete record of a single problem attempt""" 28 | 29 | problem_id: str 30 | timestamp: str | None = None 31 | score: float | None = None 32 | tokens_used: int | None = None 33 | num_cached_tokens: int | None = None 34 | cost_estimate: float | None = None 35 | wall_time: float | None = None 36 | timed_out: bool = False 37 | cost_threshold_exceeded: bool = False 38 | 39 | def is_complete(self) -> bool: 40 | # Considered complete if it has been scored 41 | return self.score is not None 42 | 43 | def update(self, **kwargs) -> None: 44 | for key, value in kwargs.items(): 45 | if hasattr(self, key): 46 | setattr(self, key, value) 47 | else: 48 | raise ValueError(f"Invalid field {key} in ProblemResult update") 49 | 50 | 51 | class BenchmarkTracker: 52 | def __init__(self, results_path: Path): 53 | self.results_path = results_path 54 | self.results: dict[str, ProblemResult] = self._load_or_create() 55 | 56 | def _load_or_create(self) -> dict[str, ProblemResult]: 57 | results = {} 58 | if self.results_path.exists(): 59 | with jsonlines.open(self.results_path) as reader: 60 | for line in reader: 61 | results[line["problem_id"]] = ProblemResult(**line) 62 | return results 63 | 64 | def start_problem(self, problem_id: str) -> None: 65 | result = ProblemResult( 66 | problem_id=problem_id, timestamp=datetime.now().isoformat() 67 | ) 68 | self.results[problem_id] = result 69 | with jsonlines.open(self.results_path, mode="a") as writer: 70 | writer.write(asdict(result)) 71 | 72 | def update_problem(self, problem_id: str, **kwargs) -> None: 73 | if problem_id not in self.results: 74 | raise KeyError(f"Problem {problem_id} not found") 75 | 76 | self.results[problem_id].update(**kwargs) 77 | 78 | # Rewrite the file with updated results 79 | with jsonlines.open(self.results_path, mode="w") as writer: 80 | writer.write_all(asdict(result) for result in self.results.values()) 81 | 82 | 83 | class BaseBenchmark: 84 | 85 | name: ClassVar[str] 86 | 87 | def __init__(self, seed: int | None = None, subset_size: int | None = None): 88 | self.problem_idx: int = 0 89 | self.seed = seed 90 | self.subset_size = subset_size 91 | 92 | @property 93 | @abstractmethod 94 | def problems(self) -> list[Problem]: 95 | pass 96 | 97 | @abstractmethod 98 | async def score_problem( 99 | self, 100 | problem: Problem, 101 | agent_workdir: str, 102 | agent_answer_dir: str, 103 | container_name: str, 104 | ) -> tuple[float, str | None, str | None]: 105 | """ 106 | Score the answer to the problem; the agent_workdir is an absolute path 107 | to the mapped /home/agent/workdir in the docker container, while the 108 | agent_answer_dir is the absolute path to the mapped logdir in the 109 | docker container, which should contain an answer.txt file. 
110 | 
111 |         To get the submitted answer (if relevant):
112 | 
113 |         answer_path = Path(agent_answer_dir) / "answer.txt"
114 |         llm_answer = answer_path.read_text().strip()
115 | 
116 |         Return the score (as a float), any parsing errors, and any additional
117 |         discussion or information about the answer that can assist the summary.
118 |         """
119 |         pass
120 | 
121 |     def get_problem(self, problem_id: str) -> Problem | None:
122 |         """Retrieve a specific problem by ID.
123 |         Override this method if there is a more efficient way of locating the
124 |         problem by problem_id.
125 |         """
126 |         return next((p for p in self.problems if p.problem_id == problem_id), None)
127 | 
128 |     async def setup_problem(
129 |         self, problem: Problem, problem_data_dir: Path, container_name: str
130 |     ) -> None:
131 |         """Optional hook for performing problem-specific setup.
132 | 
133 |         This is called before each problem is run. The problem_data_dir
134 |         will be mounted in the agent's container at /home/agent/workdir.
135 | 
136 |         Args:
137 |             problem: The problem being run
138 |             problem_data_dir: Path to a temporary directory for problem data.
139 |                 This directory will be mounted in the agent's container.
140 |             container_name: The name of the container that the problem will run in
141 |         """
142 |         pass  # Default no-op implementation
143 | 
-------------------------------------------------------------------------------- /base_agent/src/benchmarks/gsm8k.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import re
7 | import random
8 | import logging
9 | 
10 | from typing import List
11 | from pathlib import Path
12 | from datasets import load_dataset
13 | from dataclasses import dataclass
14 | 
15 | from .base import BaseBenchmark, Problem
16 | 
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
20 | @dataclass
21 | class GSM8KExample:
22 |     """A single GSM8K example."""
23 | 
24 |     question: str
25 |     answer: str
26 |     steps: list[str]
27 |     final_answer: float
28 | 
29 |     @classmethod
30 |     def from_raw(cls, example: dict) -> "GSM8KExample":
31 |         """Create a GSM8KExample from a raw dataset example."""
32 |         # Split answer into steps and final answer
33 |         answer_parts = example["answer"].split("####")
34 |         steps = [s.strip() for s in answer_parts[0].split("\n") if s.strip()]
35 |         final_answer = float(answer_parts[1].strip().replace(",", ""))
36 | 
37 |         return cls(
38 |             question=example["question"].strip() + "\n\nWhen submitting your answer, please just give a single number with no accompanying text, units or other markings.",
39 |             answer=example["answer"].strip(),
40 |             steps=steps,
41 |             final_answer=final_answer,
42 |         )
43 | 
44 |     def extract_calculations(self) -> List[tuple[str, float, float]]:
45 |         """Extract arithmetic calculations from the solution steps.
46 | 
47 |         Returns:
48 |             List of tuples containing (expression, expected_result, actual_result)
49 |         """
50 |         calculations = []
51 |         pattern = r"<<(.+?)=(.+?)>>"
52 | 
53 |         for step in self.steps:
54 |             matches = re.finditer(pattern, step)
55 |             for match in matches:
56 |                 expr, result = match.groups()
57 |                 try:
58 |                     # Clean the expression and make it Python-safe
59 |                     expr = expr.strip().replace("×", "*").replace("÷", "/")
60 |                     actual = eval(
61 |                         expr
62 |                     )  # Note: eval is acceptable here since the input comes from the dataset's own annotations
63 |                     expected = float(result)
64 |                     calculations.append((expr, expected, actual))
65 |                 except Exception:
66 |                     continue
67 | 
68 |         return calculations
69 | 
70 | 
71 | class GSM8KBenchmark(BaseBenchmark):
72 | 
73 |     name = "gsm8k"
74 | 
75 |     def __init__(self, seed: int | None = None, subset_size: int | None = None):
76 |         super().__init__(seed, subset_size)
77 | 
78 |         # Validate inputs
79 |         if subset_size is not None and subset_size <= 0:
80 |             raise ValueError("subset_size must be positive")
81 | 
82 |         dataset = load_dataset("openai/gsm8k", "main")
83 |         # self.train_data = [GSM8KExample.from_raw(ex) for ex in dataset["train"]]
84 |         self.test_data = [GSM8KExample.from_raw(ex) for ex in dataset["test"]]
85 | 
86 |         self._data = [
87 |             Problem(problem_id=str(i), statement=p.question, answer=p.final_answer, answer_discussion="\n".join(p.steps))
88 |             for i, p in enumerate(self.test_data)
89 |         ]
90 | 
91 |         # Create randomized subset if requested
92 |         if subset_size is not None:
93 |             random.seed(seed)
94 |             self._data = random.sample(self._data, subset_size)
95 | 
96 |     @property
97 |     def problems(self) -> list[Problem]:
98 |         return self._data
99 | 
100 |     async def score_problem(
101 |         self,
102 |         problem: Problem,
103 |         agent_workdir: str,
104 |         agent_answer_dir: str,
105 |         container_name: str,
106 |     ) -> tuple[float, str | None, str | None]:
107 |         try:
108 |             answer_path = Path(agent_answer_dir) / "answer.txt"
109 |             llm_answer = answer_path.read_text().strip()
110 | 
111 |             float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
112 |             if abs(problem.answer - float_answer) < 1e-7:
113 |                 return 1.0, None, problem.answer_discussion
114 |             else:
115 |                 return 0.0, None, problem.answer_discussion
116 |         except Exception as e:
117 |             logger.debug(f"Error in gsm8k scoring: {e}")
118 |             return 0.0, str(e), problem.answer_discussion
-------------------------------------------------------------------------------- /base_agent/src/benchmarks/gsm_ic.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
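Before moving on to GSM-IC, here is a small worked example of the GSM8K parsing in gsm8k.py above: `from_raw` splits a raw record on the `####` marker, and `extract_calculations` pulls the `<<expr=result>>` annotations out of the steps. The record shown is illustrative, not a real dataset row, and assumes `GSM8KExample` is in scope:

raw = {
    "question": "Ali has 4 boxes of 6 eggs. He sells 5 eggs. How many are left?",
    "answer": "He starts with <<4*6=24>>24 eggs.\nAfter selling he has <<24-5=19>>19 left.\n#### 19",
}

ex = GSM8KExample.from_raw(raw)
# The solution text before '####' becomes the step list...
assert ex.steps == [
    "He starts with <<4*6=24>>24 eggs.",
    "After selling he has <<24-5=19>>19 left.",
]
# ...and the text after '####' becomes the numeric target.
assert ex.final_answer == 19.0

# Each <<expr=result>> annotation is re-evaluated and recorded as
# (expression, expected_result, actual_result); eval("4*6") yields an int.
assert ex.extract_calculations() == [("4*6", 24.0, 24), ("24-5", 19.0, 19)]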
6 | import random
7 | import logging
8 | 
9 | from pathlib import Path
10 | from datasets import load_dataset
11 | from dataclasses import dataclass
12 | 
13 | from .base import BaseBenchmark, Problem
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
17 | 
18 | @dataclass
19 | class GSMICExample:
20 |     """A single GSM-IC example with irrelevant context."""
21 | 
22 |     question: str
23 |     answer: float
24 |     n_steps: int
25 | 
26 |     @classmethod
27 |     def from_raw(cls, example: dict) -> "GSMICExample":
28 |         """Create a GSMICExample from a raw dataset example."""
29 |         return cls(
30 |             question=example["question"].strip(),
31 |             answer=float(str(example["answer"]).strip().replace(",", "")),
32 |             n_steps=int(example["n_steps"]),
33 |         )
34 | 
35 | 
36 | class GSMICBenchmark(BaseBenchmark):
37 |     """Benchmark for the GSM-IC dataset that tests mathematical reasoning with irrelevant context."""
38 | 
39 |     name = "gsm_ic"
40 | 
41 |     def __init__(self, seed: int | None = None, subset_size: int | None = None):
42 |         """Initialize the GSM-IC benchmark.
43 | 
44 |         Args:
45 |             subset_size: Optional number of problems to randomly sample; if None, the full validation split is used
46 |         """
47 |         super().__init__(seed, subset_size)
48 | 
49 |         # Validate inputs
50 |         if subset_size is not None and subset_size <= 0:
51 |             raise ValueError("subset_size must be positive")
52 | 
53 |         # Load the dataset
54 |         dataset = load_dataset("voidful/GSM-IC")
55 |         self.data = [GSMICExample.from_raw(ex) for ex in dataset["validation"]]
56 | 
57 |         # Create problem instances
58 |         self._problems = [
59 |             Problem(problem_id=str(i), statement=p.question, answer=p.answer, answer_discussion=None)
60 |             for i, p in enumerate(self.data)
61 |         ]
62 | 
63 |         # Create randomized subset if requested
64 |         if subset_size is not None:
65 |             random.seed(seed)
66 |             self._problems = random.sample(self._problems, subset_size)
67 | 
68 |     @property
69 |     def problems(self) -> list[Problem]:
70 |         """Return the list of problems."""
71 |         return self._problems
72 | 
73 |     async def score_problem(
74 |         self,
75 |         problem: Problem,
76 |         agent_workdir: str,
77 |         agent_answer_dir: str,
78 |         container_name: str,
79 |     ) -> tuple[float, str | None, str | None]:
80 |         """Score an answer from the LLM against the ground truth.
81 | 
82 |         Args:
83 |             problem: Problem instance containing the ground truth
84 |             agent_answer_dir: Directory expected to contain the agent's answer.txt
85 | 
86 |         Returns:
87 |             A (score, parse_error, discussion) tuple, with a score of 1.0 if the answer is correct and 0.0 otherwise
88 |         """
89 |         try:
90 |             answer_path = Path(agent_answer_dir) / "answer.txt"
91 |             llm_answer = answer_path.read_text().strip()
92 | 
93 |             # Clean and convert llm answer to float
94 |             float_answer = float(llm_answer.strip().replace(",", "").replace(" ", ""))
95 | 
96 |             # Compare with small tolerance
97 |             if abs(problem.answer - float_answer) < 1e-7:
98 |                 return 1.0, None, None
99 |             return 0.0, None, None
100 | 
101 |         except Exception as e:
102 |             logger.debug(f"Error in GSM-IC scoring: {e}")
103 |             return 0.0, str(e), None
-------------------------------------------------------------------------------- /base_agent/src/callgraph/__init__.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | Call graph tracking and oversight for agent functions and tools.
8 | 9 | This module provides: 10 | - Graph data structures for tracking execution 11 | - Visualization utilities 12 | """ 13 | -------------------------------------------------------------------------------- /base_agent/src/callgraph/digraph.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Core directed graph implementation for tracking sub-agent calls. 7 | 8 | Note, in this module, the terms "agent" and "function" are used interchangeably. 9 | """ 10 | 11 | from typing import Dict, Set, List, Optional, Iterator 12 | from datetime import datetime 13 | from dataclasses import dataclass, field 14 | 15 | 16 | @dataclass 17 | class FunctionNode: 18 | """ 19 | Represents a function execution in the call graph. 20 | 21 | This tracks the essential metadata about a function execution, 22 | including timing, results, and relationships to other functions. 23 | """ 24 | 25 | # Core identity 26 | id: str 27 | name: str 28 | parent_id: Optional[str] = None 29 | children: Set[str] = field(default_factory=set) 30 | 31 | # Execution state 32 | started_at: Optional[datetime] = None 33 | completed_at: Optional[datetime] = None 34 | success: bool | None = None 35 | error: Optional[str] = None 36 | 37 | # Function-specific data 38 | args: Dict = field(default_factory=dict) 39 | result: Optional[str] = None 40 | 41 | # Metrics 42 | token_count: int = 0 43 | num_cached_tokens: int = 0 44 | cost: float = 0.0 45 | 46 | @property 47 | def duration_seconds(self) -> Optional[float]: 48 | """Calculate execution duration if completed.""" 49 | if self.completed_at and self.started_at: 50 | return (self.completed_at - self.started_at).total_seconds() 51 | return None 52 | 53 | 54 | class CallGraph: 55 | """ 56 | Directed graph tracking function calls / agent calls. 57 | 58 | The graph maintains parent-child relationships between function 59 | calls and tracks execution metrics for each function. 60 | """ 61 | 62 | def __init__(self): 63 | self.nodes: Dict[str, FunctionNode] = {} 64 | self._root_id: Optional[str] = None 65 | 66 | @property 67 | def root(self) -> Optional[FunctionNode]: 68 | """Get the root node if it exists.""" 69 | return self.nodes.get(self._root_id) if self._root_id else None 70 | 71 | def add_node(self, node: FunctionNode) -> None: 72 | """ 73 | Add a node to the graph. 74 | 75 | If this is the first node, it becomes the root. 
76 | """ 77 | self.nodes[node.id] = node 78 | if not self._root_id: 79 | self._root_id = node.id 80 | 81 | def get_node(self, node_id: str) -> Optional[FunctionNode]: 82 | """Get a node by ID.""" 83 | return self.nodes.get(node_id) 84 | 85 | def add_edge(self, from_id: str, to_id: str) -> None: 86 | """Add a directed edge between nodes.""" 87 | if from_id not in self.nodes or to_id not in self.nodes: 88 | raise ValueError("Both nodes must exist in the graph") 89 | 90 | self.nodes[from_id].children.add(to_id) 91 | self.nodes[to_id].parent_id = from_id 92 | 93 | def get_children(self, node_id: str) -> List[FunctionNode]: 94 | """Get all child nodes of a given node.""" 95 | node = self.nodes.get(node_id) 96 | if not node: 97 | return [] 98 | return [self.nodes[child_id] for child_id in node.children] 99 | 100 | def get_ancestors(self, node_id: str) -> List[FunctionNode]: 101 | """Get all ancestors of a node (parent, parent's parent, etc).""" 102 | ancestors = [] 103 | current = self.nodes.get(node_id) 104 | while current and current.parent_id: 105 | parent = self.nodes.get(current.parent_id) 106 | if parent: 107 | ancestors.append(parent) 108 | current = parent 109 | else: 110 | break 111 | return ancestors 112 | 113 | def get_subtree(self, root_id: str) -> Set[str]: 114 | """Get all node IDs in the subtree rooted at root_id.""" 115 | subtree = {root_id} 116 | node = self.nodes.get(root_id) 117 | if node: 118 | for child_id in node.children: 119 | subtree.update(self.get_subtree(child_id)) 120 | return subtree 121 | 122 | def remove_subtree(self, root_id: str) -> None: 123 | """Remove a node and its entire subtree.""" 124 | subtree = self.get_subtree(root_id) 125 | for node_id in subtree: 126 | node = self.nodes.pop(node_id, None) 127 | if node and node.parent_id: 128 | parent = self.nodes.get(node.parent_id) 129 | if parent: 130 | parent.children.remove(node_id) 131 | 132 | def iter_bfs(self) -> Iterator[FunctionNode]: 133 | """Iterate through nodes in breadth-first order.""" 134 | if not self._root_id: 135 | return 136 | 137 | visited = set() 138 | queue = [self._root_id] 139 | 140 | while queue: 141 | node_id = queue.pop(0) 142 | if node_id not in visited: 143 | visited.add(node_id) 144 | node = self.nodes.get(node_id) 145 | if node: 146 | yield node 147 | queue.extend(node.children) 148 | 149 | def iter_dfs(self) -> Iterator[FunctionNode]: 150 | """Iterate through nodes in depth-first order.""" 151 | if not self._root_id: 152 | return 153 | 154 | visited = set() 155 | 156 | def dfs(node_id: str) -> Iterator[FunctionNode]: 157 | if node_id not in visited: 158 | visited.add(node_id) 159 | node = self.nodes.get(node_id) 160 | if node: 161 | yield node 162 | for child_id in node.children: 163 | yield from dfs(child_id) 164 | 165 | yield from dfs(self._root_id) 166 | 167 | def find_cycles(self) -> List[List[str]]: 168 | """Find any cycles in the graph.""" 169 | cycles = [] 170 | visited = set() 171 | path = [] 172 | path_set = set() 173 | 174 | def dfs(node_id: str) -> None: 175 | if node_id in path_set: 176 | cycle_start = path.index(node_id) 177 | cycles.append(path[cycle_start:] + [node_id]) 178 | return 179 | 180 | if node_id in visited: 181 | return 182 | 183 | visited.add(node_id) 184 | path.append(node_id) 185 | path_set.add(node_id) 186 | 187 | node = self.nodes.get(node_id) 188 | if node: 189 | for child_id in node.children: 190 | dfs(child_id) 191 | 192 | path.pop() 193 | path_set.remove(node_id) 194 | 195 | if self._root_id: 196 | dfs(self._root_id) 197 | 198 | return cycles 199 | 
200 | def get_execution_metrics(self) -> Dict: 201 | """Get overall execution metrics.""" 202 | total_tokens = sum(n.token_count for n in self.nodes.values()) 203 | num_cached_tokens = sum(n.num_cached_tokens for n in self.nodes.values()) 204 | total_cost = sum(n.cost for n in self.nodes.values()) 205 | 206 | complete_nodes = [ 207 | n for n in self.nodes.values() if n.started_at and n.completed_at 208 | ] 209 | 210 | total_duration = ( 211 | sum( 212 | n.duration_seconds 213 | for n in complete_nodes 214 | if n.duration_seconds is not None 215 | ) 216 | if complete_nodes 217 | else 0 218 | ) 219 | 220 | successes = sum(1 for n in self.nodes.values() if n.success) 221 | failures = sum(1 for n in self.nodes.values() if not n.success) 222 | 223 | return { 224 | "total_functions": len(self.nodes), 225 | "total_tokens": total_tokens, 226 | "num_cached_tokens": num_cached_tokens, 227 | "total_cost": total_cost, 228 | "total_duration": total_duration, 229 | "successful_calls": successes, 230 | "failed_calls": failures, 231 | } 232 | -------------------------------------------------------------------------------- /base_agent/src/config.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from pydantic import field_validator 7 | from pydantic_settings import BaseSettings 8 | 9 | from .types.llm_types import Model 10 | 11 | 12 | class Settings(BaseSettings): 13 | # Basic Agent Configuration 14 | NAME: str = "self_referential_agent" 15 | LOG_LEVEL: str = "INFO" 16 | 17 | MODEL: Model = Model.SONNET_37 18 | REASONING_MODEL: Model = Model.O3_MINI 19 | OVERSIGHT_MODEL: Model = Model.SONNET_37 20 | 21 | @field_validator("MODEL", "REASONING_MODEL", "OVERSIGHT_MODEL", mode="before") 22 | def parse_model(cls, value): 23 | """Convert a string model name into a Model enum instance.""" 24 | if isinstance(value, str): 25 | return Model.from_name(value) 26 | elif isinstance(value, Model): 27 | return value 28 | raise ValueError(f"Invalid model value: {value!r}") 29 | 30 | model_config = { 31 | "env_prefix": "AGENT_", 32 | "case_sensitive": True, 33 | "extra": "allow", # Allow extra fields from environment 34 | } 35 | 36 | 37 | settings = Settings() 38 | -------------------------------------------------------------------------------- /base_agent/src/events/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .event_bus import EventBus 7 | 8 | __all__ = ["EventBus"] 9 | -------------------------------------------------------------------------------- /base_agent/src/events/event_bus_utils.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Utility functions for working with the event bus. 7 | 8 | Note: these are entirely un-optimised, and many require inefficient iterations 9 | over the event lists to reconstruct 'views' on the event bus. 
For SWE bench
10 | style tasks, with only up to hundreds of messages, this is not important.
11 | """
12 | 
13 | from typing import Optional, Set, List
14 | 
15 | from .event_bus import EventBus
16 | from ..types.tool_types import ToolResult
17 | from ..types.agent_types import AgentResult
18 | from ..types.event_types import EventType, Event, FileOperation, FileEvent
19 | 
20 | 
21 | async def log_to_stdout(event: Event | FileEvent):
22 |     """Print important events to stdout with clear formatting.
23 | 
24 |     The formatting of this output matters, since it is the visual feedback
25 |     that the meta-agent gets back when test-running itself.
26 |     """
27 | 
28 |     # Common formatting constants
29 |     max_content_len = 50  # Reduced to allow for richer metadata
30 |     prefix_width = 10
31 | 
32 |     def truncate(text: str, length: int = max_content_len) -> str:
33 |         """Helper to truncate text and handle newlines"""
34 |         text = text.replace("\n", " ")
35 |         return f"{text[:length]}..." if len(text) > length else text
36 | 
37 |     def format_output(prefix: str, content: str, metadata: str = "") -> None:
38 |         """Helper to format and print consistent output"""
39 |         print(
40 |             f"{prefix:<{prefix_width}s} => {content}{' | ' + metadata if metadata else ''}"
41 |         )
42 | 
43 |     event_content = truncate(str(event.content))
44 | 
45 |     if event.type in (EventType.CORE_PROMPT_UPDATE, EventType.SYSTEM_PROMPT_UPDATE):
46 |         return
47 |     elif event.type == EventType.ASSISTANT_MESSAGE:
48 |         format_output(event.type.value, event_content)
49 |     elif event.type == EventType.TOOL_CALL:
50 |         name = event.metadata.get("name", "unknown tool")
51 |         args = truncate(str(event.metadata.get("args", {})))
52 |         format_output(event.type.value, f"{name}, {args}")
53 |     elif event.type == EventType.TOOL_RESULT:
54 |         result = event.metadata.get("tool_result")
55 |         if not isinstance(result, ToolResult):
56 |             return
57 |         content = f"{result.tool_name}, success: {result.success}, "
58 |         content += f"duration: {result.duration:.1f}, {event_content} "
59 |         format_output(event.type.value, content)
60 |     elif event.type == EventType.AGENT_CALL:
61 |         name = event.metadata.get("name", "unknown agent")
62 |         args = truncate(str(event.metadata.get("args", {})))
63 |         format_output(event.type.value, f"{name}, {args}")
64 |     elif event.type == EventType.AGENT_RESULT:
65 |         result = event.metadata.get("agent_result")
66 |         if not isinstance(result, AgentResult):
67 |             return
68 |         name = result.agent_name
69 |         status = result.status.value
70 |         duration = result.metrics.duration_seconds or 0.0
71 |         cost = result.metrics.cost
72 |         res = truncate(result.result, 20)
73 |         content = f"{name}, status: {status}, duration: {duration:.1f}, cost: ${cost:.4f}, {res}"
74 |         format_output(event.type.value, content)
75 |     else:
76 |         format_output(event.type.value, event_content)
77 | 
78 | 
79 | async def get_problem_statement() -> str:
80 |     """Get the initial problem statement."""
81 |     event_bus = await EventBus.get_instance()
82 |     # There should only be one, but we handle the case when it was updated somehow
83 |     ps_events = event_bus.get_events_by_type(EventType.PROBLEM_STATEMENT)
84 |     return "\n".join(ps.content for ps in ps_events) if len(ps_events) else ""
85 | 
86 | 
87 | async def get_budget_info() -> dict[str, int | float | None]:
88 |     """Get the budget information for the current run."""
89 |     event_bus = await EventBus.get_instance()
90 |     # There should only be one, but we handle the case when it was updated somehow
91 |     ps_events = event_bus.get_events_by_type(EventType.BUDGET_INFO)
92 |     if
ps_events: 93 | return ps_events[-1].metadata 94 | else: 95 | return dict() 96 | 97 | async def get_latest_sys_prompt_event(agent_id: str | None = None) -> Optional[Event]: 98 | """Get the latest system prompt update event.""" 99 | event_bus = await EventBus.get_instance() 100 | events = ( 101 | event_bus.get_events_by_type(EventType.SYSTEM_PROMPT_UPDATE) 102 | if not agent_id 103 | else event_bus.get_events(agent_id) 104 | ) 105 | system_prompts = [e for e in events if e.type == EventType.SYSTEM_PROMPT_UPDATE] 106 | return system_prompts[-1] if system_prompts else None 107 | 108 | 109 | async def get_latest_core_prompt_event(agent_id: str | None = None) -> Optional[Event]: 110 | """Get the latest core prompt update event.""" 111 | event_bus = await EventBus.get_instance() 112 | events = ( 113 | event_bus.get_events_by_type(EventType.CORE_PROMPT_UPDATE) 114 | if not agent_id 115 | else event_bus.get_events(agent_id) 116 | ) 117 | core_prompts = [e for e in events if e.type == EventType.CORE_PROMPT_UPDATE] 118 | return core_prompts[-1] if core_prompts else None 119 | 120 | 121 | async def get_open_file_set(agent_id: str | None = None) -> Set[FileEvent]: 122 | """Get the set of currently open files.""" 123 | event_bus = await EventBus.get_instance() 124 | open_files: dict[str, FileEvent] = {} 125 | events = ( 126 | event_bus.get_events_by_type(EventType.FILE_EVENT) 127 | if not agent_id 128 | else [ 129 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 130 | ] 131 | ) 132 | 133 | for event in events: 134 | if isinstance(event, FileEvent): 135 | if event.operation == FileOperation.CLOSE and event.path in open_files: 136 | open_files.pop(event.path) 137 | elif event.operation == FileOperation.OPEN: 138 | open_files[event.path] = event 139 | return set(open_files.values()) 140 | 141 | 142 | async def is_file_open(file_path: str, agent_id: str | None = None) -> bool: 143 | """Check if a specific file is open.""" 144 | open_files = await get_open_file_set(agent_id) 145 | return any(file_event.path == file_path for file_event in open_files) 146 | 147 | 148 | async def get_latest_file_event( 149 | file_path: str, 150 | agent_id: str | None = None, 151 | exclude_close: bool = False, 152 | ) -> Optional[FileEvent]: 153 | """Get the most recent file event for a given path.""" 154 | event_bus = await EventBus.get_instance() 155 | events = ( 156 | event_bus.get_events_by_type(EventType.FILE_EVENT) 157 | if not agent_id 158 | else [ 159 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 160 | ] 161 | ) 162 | 163 | file_events = [ 164 | e 165 | for e in events 166 | if isinstance(e, FileEvent) 167 | and e.path == file_path 168 | and (e.operation != FileOperation.CLOSE if exclude_close else True) 169 | ] 170 | return file_events[-1] if file_events else None 171 | 172 | 173 | async def get_file_content_size(agent_id: str | None = None) -> int: 174 | """Calculate total size of content from file events.""" 175 | event_bus = await EventBus.get_instance() 176 | total_size = 0 177 | events = ( 178 | event_bus.get_events_by_type(EventType.FILE_EVENT) 179 | if not agent_id 180 | else [ 181 | e for e in event_bus.get_events(agent_id) if e.type == EventType.FILE_EVENT 182 | ] 183 | ) 184 | 185 | for event in events: 186 | if isinstance(event, FileEvent): 187 | total_size += len(event.content.encode("utf-8")) 188 | return total_size 189 | 190 | 191 | async def get_subagent_events( 192 | agent_id: str, 193 | event_types: Set[EventType] = set(EventType), 194 | # 
event_types: Set[EventType] = { 195 | # EventType.ASSISTANT_MESSAGE, 196 | # EventType.TOOL_RESULT, 197 | # EventType.AGENT_RESULT, 198 | # EventType.FILE_EVENT, 199 | # EventType.EXTERNAL_MESSAGE, 200 | # }, 201 | ) -> List[Event]: 202 | """Get events for prefilling assistant messages.""" 203 | event_bus = await EventBus.get_instance() 204 | all_events = event_bus.get_events_in_chain(agent_id) 205 | return [e for e in all_events if e.type in event_types] 206 | -------------------------------------------------------------------------------- /base_agent/src/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """LLM integration module for the Self-Referential Agent System. 7 | 8 | This module provides a unified interface for interacting with various LLM providers 9 | including Anthropic, OpenAI, and DeepSeek. 10 | """ 11 | 12 | import logging 13 | 14 | from .base import ( 15 | Message, 16 | Completion, 17 | CompletionChunk, 18 | TimingInfo, 19 | TextContent, 20 | ToolResultContent, 21 | ) 22 | from .api import create_completion, create_streaming_completion 23 | from .metering import token_meter, get_total_cost 24 | 25 | # Quieten LLM API call logs to make stdout more useful 26 | logging.getLogger("httpx").setLevel(logging.WARNING) 27 | 28 | __all__ = [ 29 | "Message", 30 | "Completion", 31 | "CompletionChunk", 32 | "TimingInfo", 33 | "create_completion", 34 | "create_streaming_completion", 35 | "token_meter", 36 | "get_total_cost", 37 | "TextContent", 38 | "ToolResultContent", 39 | ] 40 | -------------------------------------------------------------------------------- /base_agent/src/llm/base.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
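The base.py module that follows defines the core `Message` type. As a small usage sketch (assuming `TextContent` is constructed with a `text` field, as its use in `Message.__str__` suggests):

# Assumes Message and TextContent are imported from the llm module above.
msg = Message(
    role="user",
    content=[TextContent(text="What is 2 + 2?")],
)
print(msg)
# Expected output, per __str__ below:
# Message from role=user
# Text ----------
# What is 2 + 2?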
6 | """Base models and shared functionality for LLM interactions.""" 7 | 8 | from typing import Dict, Optional 9 | from datetime import datetime, timedelta 10 | from pydantic import BaseModel, Field 11 | 12 | from ..types.llm_types import TokenUsage, Model, StopReason, TextContent, ReasoningContent, ToolCallContent, ToolResultContent, ContentTypes 13 | 14 | # NOTE: perhaps move the rest of these classes to the llm_types for consistency 15 | 16 | 17 | class Message(BaseModel): 18 | """A message in a conversation with an LLM.""" 19 | 20 | role: str 21 | content: list[ContentTypes] 22 | name: Optional[str] = None 23 | 24 | def __str__(self) -> str: 25 | parts = [f"Message from role={self.role}"] 26 | for c in self.content: 27 | if isinstance(c, TextContent): 28 | parts.append(f"Text {'-'*10}\n{c.text}") 29 | elif isinstance(c, ReasoningContent): 30 | parts.append(f"Reasoning {'-'*10}\n{c.text}") 31 | elif isinstance(c, ToolCallContent): 32 | parts.append(f"{'-'*10}\nTool call {c.tool_name} (id: {c.call_id}) {c.call_type}: {str(c.tool_args)}\n{'-'*10}") 33 | elif isinstance(c, ToolResultContent): 34 | parts.append(f"{'-'*10}\nTool result {c.tool_name} (id: {c.call_id}): {c.content}\n{'-'*10}") 35 | # return "\n".join([p.replace("\n", "").strip() for p in parts]) 36 | return "\n".join(parts) 37 | 38 | 39 | class TimingInfo(BaseModel): 40 | """Timing information for LLM interactions.""" 41 | 42 | start_time: datetime = Field(description="When the request started") 43 | end_time: datetime = Field(description="When the response completed") 44 | total_duration: timedelta = Field(description="Total duration of the request") 45 | first_token_time: Optional[datetime] = Field( 46 | None, description="When the first token was received" 47 | ) 48 | time_to_first_token: Optional[float] = Field( 49 | None, description="Duration until first token received" 50 | ) 51 | tokens_per_second: Optional[float] = Field( 52 | None, description="Average tokens per second for completion" 53 | ) 54 | 55 | def __str__(self) -> str: 56 | # Format datetime fields to a readable format 57 | fmt = "%Y-%m-%d %H:%M:%S" 58 | parts = [ 59 | f"- Start {self.start_time.strftime(fmt)}, End {self.end_time.strftime(fmt)}", 60 | f"- Duration: {self.total_duration}", 61 | ] 62 | if self.time_to_first_token is not None: 63 | parts.append(f"- TTFT: {self.time_to_first_token:.2f} sec") 64 | if self.tokens_per_second is not None: 65 | parts.append(f"- TPS: {self.tokens_per_second:.2f}") 66 | return "\n".join(parts) 67 | 68 | class CacheMetrics(BaseModel): 69 | """Cache-related metrics.""" 70 | 71 | cache_hits: int = Field(default=0, description="Number of cache hits") 72 | cache_misses: int = Field(default=0, description="Number of cache misses") 73 | cache_writes: int = Field(default=0, description="Number of cache writes") 74 | 75 | @classmethod 76 | def from_dict(cls, data: Optional[Dict[str, int]] = None) -> "CacheMetrics": 77 | """Create metrics from dictionary, preserving provider values.""" 78 | if data is None: 79 | data = {"cache_hits": 0, "cache_misses": 0, "cache_writes": 0} 80 | return cls( 81 | cache_hits=data.get("cache_hits", 0), 82 | cache_misses=data.get("cache_misses", 0), 83 | cache_writes=data.get("cache_writes", 0), 84 | ) 85 | 86 | def to_dict(self) -> Dict[str, int]: 87 | """Convert to dictionary.""" 88 | return self.model_dump() # Use model_dump instead of dict 89 | 90 | 91 | # Completion Types ============================================================ 92 | 93 | class Completion(BaseModel): 94 | """A completion 
response from an LLM.""" 95 | 96 | id: str 97 | content: list[ContentTypes] | list[list[ContentTypes]] 98 | model: Model # Model identifier string 99 | usage: TokenUsage 100 | timing: TimingInfo 101 | cache_metrics: Optional[Dict[str, int]] = None 102 | stop_reason: StopReason | list[StopReason] = StopReason.COMPLETE 103 | stop_sequence: Optional[str] | list[StopReason] = None 104 | continuation_count: Optional[int] = None 105 | raw_response: Optional[Dict] = Field(default=None, exclude=True) 106 | 107 | @property 108 | def finished_early(self) -> bool: 109 | """Check if completion stopped before finishing normally.""" 110 | return self.stop_reason != StopReason.COMPLETE 111 | 112 | @property 113 | def hit_token_limit(self) -> bool: 114 | """Check if completion stopped due to token length.""" 115 | return self.stop_reason == StopReason.LENGTH 116 | 117 | @property 118 | def errored(self) -> bool: 119 | """Check if completion encountered an error.""" 120 | return self.stop_reason == StopReason.ERROR 121 | 122 | def get_cache_metric(self, key: str, default: int = 0) -> int: 123 | """Get a cache metric value safely.""" 124 | if self.cache_metrics is None: 125 | return default 126 | return self.cache_metrics.get(key, default) 127 | 128 | def calculate_cost(self) -> float: 129 | """Calculate the cost for this completion.""" 130 | return self.usage.calculate_cost(self.model.token_cost) 131 | 132 | def __str__(self) -> str: 133 | comp_str = f"{'='*80}\n" 134 | if isinstance(self.content[0], list): 135 | for i, completion in enumerate(self.content): 136 | comp_str += f"Candidate {i:03d} {70*'-'}\n" 137 | for block in completion: 138 | comp_str += str(block) + "\n" 139 | else: 140 | for block in self.content: 141 | comp_str += str(block) + "\n" 142 | comp_str += f"\n{'-'*80}\n" 143 | comp_str += f"Model: {self.model.id}\n" 144 | comp_str += f"""Tokens used: 145 | - Input {self.usage.input_tokens} (cached: {self.usage.cached_prompt_tokens}, written to cache: {self.usage.cache_write_prompt_tokens}) 146 | - Completion {self.usage.completion_tokens} 147 | - Total {self.usage.total_tokens} 148 | """ 149 | if self.stop_reason != StopReason.COMPLETE: 150 | comp_str += f"Stop reason: {self.stop_reason}\n" 151 | if self.stop_sequence: 152 | comp_str += f"Stop sequence: {self.stop_sequence}\n" 153 | 154 | if self.continuation_count: 155 | comp_str += f"Continuations: {self.continuation_count}\n" 156 | 157 | if self.timing: 158 | comp_str += f"Timing:\n{self.timing}\n" 159 | 160 | comp_str += f"Cost: ${self.calculate_cost():.6f}\n" 161 | 162 | comp_str += f"{'='*80}\n" 163 | return comp_str 164 | 165 | 166 | class CompletionChunk(BaseModel): 167 | """A streaming chunk of a completion response.""" 168 | 169 | id: str 170 | content: str # TODO: make tool call or assistant message string 171 | model: Model # Model identifier string 172 | is_finished: bool = False 173 | timing: Optional[TimingInfo] = None 174 | usage: Optional[TokenUsage] = None 175 | cache_metrics: Optional[Dict[str, int]] = None 176 | stop_reason: Optional[StopReason] = None 177 | continuation_count: Optional[int] = None 178 | raw_response: Optional[Dict] = Field(default=None, exclude=True) 179 | 180 | @property 181 | def finished_early(self) -> bool: 182 | """Check if completion stopped before finishing normally.""" 183 | return bool(self.stop_reason and self.stop_reason != StopReason.COMPLETE) 184 | 185 | @property 186 | def hit_token_limit(self) -> bool: 187 | """Check if completion stopped due to token length.""" 188 | return 
bool(self.stop_reason and self.stop_reason == StopReason.LENGTH) 189 | 190 | @property 191 | def errored(self) -> bool: 192 | """Check if completion encountered an error.""" 193 | return bool(self.stop_reason and self.stop_reason == StopReason.ERROR) 194 | 195 | def get_cache_metric(self, key: str, default: int = 0) -> int: 196 | """Get a cache metric value safely.""" 197 | if self.cache_metrics is None: 198 | return default 199 | return self.cache_metrics.get(key, default) 200 | 201 | def model_dump(self, **kwargs) -> Dict: 202 | """Override model_dump to exclude raw_response by default.""" 203 | kwargs.setdefault("exclude", {"raw_response"}) 204 | return super().model_dump(**kwargs) 205 | -------------------------------------------------------------------------------- /base_agent/src/llm/metering.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from typing import DefaultDict 7 | from collections import defaultdict 8 | 9 | from ..types.llm_types import TokenUsage, Model 10 | 11 | # A mapping from models to token usage and dollar cost 12 | token_meter: DefaultDict[Model, TokenUsage] = defaultdict(TokenUsage) 13 | budget_info: dict[str, None | int | float] = dict( 14 | start_time=None, # start timestamp 15 | cost_budget=None, # cost budget in USD 16 | time_budget=None, # time budget in seconds 17 | ) 18 | 19 | 20 | def get_total_cost() -> float: 21 | total = 0.0 22 | for model in Model: 23 | total += token_meter[model].calculate_cost(model.token_cost) 24 | return total 25 | 26 | 27 | def get_total_usage() -> TokenUsage: 28 | usage = TokenUsage() 29 | for model in Model: 30 | usage += token_meter[model] 31 | return usage 32 | 33 | 34 | class CallCounter: 35 | def __init__(self): 36 | self.count = 0 37 | 38 | def count_new_call(self): 39 | self.count += 1 40 | 41 | def get_count(self) -> int: 42 | return self.count 43 | 44 | 45 | llm_call_counter = CallCounter() 46 | -------------------------------------------------------------------------------- /base_agent/src/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
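A sketch of how the metering module above is meant to be used. The field names `input_tokens` and `completion_tokens` appear in `Completion.__str__` earlier in base.py; passing them as keyword arguments to `TokenUsage` is an assumption:

# Assumes token_meter, TokenUsage, Model, get_total_usage, get_total_cost and
# llm_call_counter are imported from the metering module and llm types above.

# Record usage for a model after an API call...
token_meter[Model.SONNET_37] += TokenUsage(input_tokens=1200, completion_tokens=350)
token_meter[Model.O3_MINI] += TokenUsage(input_tokens=800, completion_tokens=200)

# ...and read back the aggregate spend at any point.
print(f"total usage: {get_total_usage().total_tokens} tokens")
print(f"total cost:  ${get_total_cost():.4f}")

llm_call_counter.count_new_call()
assert llm_call_counter.get_count() == 1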
6 | """Provider-specific implementations for different LLM services.""" 7 | 8 | from .anthropic import AnthropicProvider 9 | from .openai import OpenAIProvider 10 | from .deepseek import DeepSeekProvider 11 | from .fireworks import FireworksProvider 12 | 13 | from .google import GoogleProvider 14 | from .google_rest import GoogleRESTProvider 15 | from .google_oai import GoogleOAIProvider 16 | from .vertex import VertexProvider 17 | 18 | __all__ = [ 19 | "AnthropicProvider", 20 | "OpenAIProvider", 21 | "DeepSeekProvider", 22 | "FireworksProvider", 23 | "GoogleProvider", 24 | "GoogleRESTProvider", 25 | "GoogleOAIProvider", 26 | "VertexProvider", 27 | ] 28 | -------------------------------------------------------------------------------- /base_agent/src/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .representation import ( 7 | get_schema_representation, 8 | ArgFormat, 9 | dumps, 10 | ) 11 | from .xml_dumps import xml_dumps 12 | from .xml_parsing import xml_str_to_dict 13 | from .json_parsing import json_str_to_dict 14 | 15 | 16 | from typing import Type 17 | from pydantic import BaseModel 18 | 19 | 20 | async def args_str_to_dict( 21 | tool_args: str, guide_obj: Type[BaseModel], arg_format: ArgFormat, root_tag: str 22 | ) -> tuple[dict | None, str | None]: 23 | 24 | # Get schema representation for LLM fixing 25 | if arg_format == ArgFormat.JSON: 26 | return await json_str_to_dict(tool_args, guide_obj) 27 | else: 28 | tool_args = f"<{root_tag}>\n{tool_args}\n" 29 | return await xml_str_to_dict(tool_args, guide_obj, root_tag=root_tag) 30 | -------------------------------------------------------------------------------- /base_agent/src/schemas/representation.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Utilities for consistent representation of Pydantic models. 8 | Provides both JSON and XML based formats optimized for LLM readability. 9 | """ 10 | 11 | import json 12 | import logging 13 | 14 | from enum import Enum 15 | from typing import Type, Dict, Any, Union, get_args, get_origin, Literal, List 16 | from pydantic import BaseModel 17 | 18 | from .xml_dumps import xml_dumps 19 | from ..types.common import ArgFormat 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def get_type_info(field: Any) -> str: 25 | """ 26 | Get human-readable type info for a field. 27 | Handles both Pydantic fields and schema properties. 
28 | """ 29 | field_type = field.annotation 30 | parts = [] 31 | 32 | # Handle Optional types first 33 | is_optional = False 34 | if get_origin(field_type) is Union and type(None) in get_args(field_type): 35 | is_optional = True 36 | parts.append("optional") 37 | field_type = next(t for t in get_args(field_type) if t is not type(None)) 38 | 39 | # Handle Literal types 40 | if get_origin(field_type) is Literal: 41 | literal_values = get_args(field_type) 42 | options = ", ".join(f"'{val}'" for val in literal_values) 43 | parts.append(f"one of [{options}]") 44 | elif isinstance(field_type, type) and issubclass(field_type, Enum): 45 | options = ", ".join(f"'{item.name}'" for item in field_type) 46 | parts.append(f"one of [{options}]") 47 | else: 48 | # Get base type 49 | if get_origin(field_type) is list: 50 | item_type = get_args(field_type)[0] 51 | type_str = f"list of {_get_base_type(item_type)}" 52 | elif get_origin(field_type) is dict: 53 | key_type, val_type = get_args(field_type) 54 | type_str = ( 55 | f"dict of {_get_base_type(key_type)} to {_get_base_type(val_type)}" 56 | ) 57 | else: 58 | type_str = _get_base_type(field_type) 59 | parts.append(type_str) 60 | 61 | # Add constraints 62 | constraints = _get_field_constraints(field) 63 | if constraints: 64 | parts.extend(constraints) 65 | 66 | # Handle special cases first 67 | is_required = not is_optional and field.is_required() 68 | 69 | if ( 70 | hasattr(field.default, "__class__") 71 | and field.default.__class__.__name__ == "PydanticUndefinedType" 72 | ) or field.default is Ellipsis: 73 | # Required field (PydanticUndefined or Ellipsis) 74 | parts.append("required") 75 | elif field.default_factory is not None: 76 | # Show empty container for defaults from factory functions 77 | if field_type == Dict or get_origin(field_type) is dict: 78 | parts.append("default: {}") 79 | elif field_type == List or get_origin(field_type) is list: 80 | parts.append("default: []") 81 | elif field.default is not None: 82 | # Explicit default value 83 | parts.append(f"default: {_format_default(field.default)}") 84 | elif is_optional: 85 | # Optional field without default 86 | parts.append("default: null") 87 | 88 | # Add description 89 | if field.description: 90 | parts.append(field.description) 91 | 92 | return ", ".join(parts) 93 | 94 | 95 | def _get_base_type(field_type: Type) -> str: 96 | """Map Python types to schema types.""" 97 | type_map = { 98 | str: "string", 99 | int: "integer", 100 | float: "float", 101 | bool: "boolean", 102 | Any: "any", 103 | } 104 | # For custom classes, use class name 105 | if isinstance(field_type, type): 106 | if issubclass(field_type, BaseModel): 107 | return field_type.__name__.lower() 108 | elif issubclass(field_type, Enum): 109 | return "enum" 110 | return type_map.get(field_type, str(field_type)) 111 | 112 | 113 | def _get_field_constraints(field: Any) -> list[str]: 114 | """Extract field constraints as readable strings.""" 115 | constraints = [] 116 | metadata = field.metadata if hasattr(field, "metadata") else [] 117 | 118 | constraint_names = { 119 | "Gt": ("gt", "greater than"), 120 | "Ge": ("ge", "min"), 121 | "Lt": ("lt", "less than"), 122 | "Le": ("le", "max"), 123 | "MinLength": ("min_length", "min length"), 124 | "MaxLength": ("max_length", "max length"), 125 | "MinItems": ("min_items", "min items"), 126 | "MaxItems": ("max_items", "max items"), 127 | } 128 | 129 | for item in metadata: 130 | item_type = type(item).__name__ 131 | if item_type in constraint_names: 132 | attr_name, label = 
constraint_names[item_type]
133 |             value = getattr(item, attr_name)
134 |             if value is not None:
135 |                 constraints.append(f"{label}: {value}")
136 | 
137 |     return constraints
138 | 
139 | 
140 | def _format_default(value: Any) -> str:
141 |     """Format default values consistently."""
142 |     # Check for PydanticUndefined (required field with no default)
143 |     if (
144 |         hasattr(value, "__class__")
145 |         and value.__class__.__name__ == "PydanticUndefinedType"
146 |     ):
147 |         return "required"
148 | 
149 |     if isinstance(value, str):
150 |         return f"'{value}'"
151 |     elif isinstance(value, (list, dict)):
152 |         return json.dumps(value)
153 |     elif isinstance(value, Enum):
154 |         return f"'{value.name}'"
155 |     elif callable(value):  # e.g., default_factory
156 |         return "{}"  # Show empty dict/list for factory defaults
157 |     elif value is Ellipsis:
158 |         return "required"  # Explicit handling of Ellipsis
159 |     elif value is None:
160 |         return "null"
161 | 
162 |     return str(value).lower() if isinstance(value, bool) else str(value)
163 | 
164 | 
165 | def get_json_schema_representation(model: Type[BaseModel]) -> str:
166 |     """
167 |     Generate a JSON schema representation focused on LLM readability.
168 |     """
169 |     fields = model.model_fields
170 |     output = []
171 | 
172 |     for field_name, field in fields.items():
173 |         type_info = get_type_info(field)
174 |         output.append(f'"{field_name}": {type_info}')
175 | 
176 |     return "{\n  " + ",\n  ".join(output) + "\n}"
177 | 
178 | 
179 | def get_xml_schema_representation(
180 |     model: Type[BaseModel], root_tag: str | None = None
181 | ) -> str:
182 |     """
183 |     Generate an XML schema representation focused on LLM readability.
184 |     """
185 |     fields = model.model_fields
186 |     # Add angle brackets for root tag if necessary
187 |     output = [f"<{root_tag}>"] if root_tag else []
188 | 
189 |     for field_name, field in fields.items():
190 |         info = get_type_info(field)
191 |         output.append(f"<{field_name}>{info}</{field_name}>")
192 | 
193 |     if root_tag:
194 |         output.append(f"</{root_tag}>")
195 |     return "\n".join(output)
196 | 
197 | 
198 | def get_schema_representation(
199 |     cls: Type[BaseModel], arg_format: ArgFormat, root_tag: str | None = None
200 | ) -> str:
201 |     if arg_format == ArgFormat.JSON:
202 |         return get_json_schema_representation(cls)
203 |     else:
204 |         return get_xml_schema_representation(cls, root_tag=root_tag)
205 | 
206 | 
207 | def dumps(
208 |     instance: dict, format: ArgFormat, indent: int, root_tag: str | None = None
209 | ) -> str:
210 |     if format == ArgFormat.JSON:
211 |         return json.dumps(instance, indent=indent)
212 |     else:
213 |         return xml_dumps(instance, root_tag=root_tag, indent=indent)
-------------------------------------------------------------------------------- /base_agent/src/tools/__init__.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
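For a rough sense of what the schema helpers in representation.py above produce (the exact field annotations come from `get_type_info`, and the printed output shown is an illustrative sketch rather than captured output):

# Assumes get_schema_representation and ArgFormat are imported from the schemas module.
from pydantic import BaseModel, Field

class SearchArgs(BaseModel):
    query: str = Field(..., description="The search string")
    max_results: int = Field(default=10, description="Result cap")

print(get_schema_representation(SearchArgs, ArgFormat.XML, root_tag="TOOL_ARGS"))
# Roughly:
# <TOOL_ARGS>
# <query>string, required, The search string</query>
# <max_results>integer, default: 10, Result cap</max_results>
# </TOOL_ARGS>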
6 | """ 7 | A module of Agent tools 8 | """ 9 | 10 | from .base_tool import BaseTool, tool_registry 11 | from .file_tools import CloseFile, OpenFile 12 | from .edit_tools import OverwriteFile 13 | from .execute_command import ExecuteCommand 14 | from .directory_tools import ViewDirectory 15 | from .ripgrep_tool import RipGrepTool 16 | 17 | # TODO: expand the concept of toolkits and use throughout the agent implementations 18 | toolkits: dict[str, list[BaseTool]] = dict( 19 | coding=[ 20 | ViewDirectory, 21 | ExecuteCommand, 22 | OpenFile, 23 | CloseFile, 24 | OverwriteFile, 25 | RipGrepTool, 26 | ] 27 | ) 28 | -------------------------------------------------------------------------------- /base_agent/src/tools/answer_submission.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import logging 7 | 8 | from pydantic import Field 9 | 10 | from .base_tool import BaseTool 11 | from ..schemas import args_str_to_dict 12 | from ..types.tool_types import ToolResult 13 | from ..types.agent_types import AgentInterface 14 | from ..types.common import ArgFormat 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.INFO) 18 | 19 | 20 | class SubmitAnswer(BaseTool): 21 | """Tool for submitting answers to benchmark questions on disk. 22 | 23 | This is slightly different to the ReturnResult tool which is used to return 24 | a result from the end of agent function call. 25 | """ 26 | 27 | TOOL_NAME = "submit_answer" 28 | TOOL_DESCRIPTION = """Submit an answer to a benchmark question. The answer should be clear and concise. 29 | The tool will attempt to parse your answer according to the benchmark's requirements. 30 | Your answer should be a complete response that directly addresses the question. 31 | It is very important that you do not include any extraneous words or content in the answer field that may make the parsing fail. 
32 | """ 33 | 34 | # reasoning: str = Field( 35 | # ..., 36 | # description="Reason about the answer you are going to submit and the correct format in which to do so", 37 | # ) 38 | 39 | answer: str = Field( 40 | ..., description="Your complete answer to the benchmark question", min_length=1 41 | ) 42 | 43 | def __init__(self, calling_agent: AgentInterface, **data): 44 | super().__init__(calling_agent=calling_agent, **data) 45 | 46 | async def run(self) -> ToolResult: 47 | """Execute the answer submission with parsing.""" 48 | try: 49 | if not self._calling_agent._logdir: 50 | return ToolResult( 51 | tool_name=self.TOOL_NAME, 52 | success=False, 53 | errors="System error: no answer path available", 54 | ) 55 | 56 | # Validate answer is not empty or just whitespace 57 | answer = self.answer.strip() 58 | if not answer: 59 | return ToolResult( 60 | tool_name=self.TOOL_NAME, 61 | success=False, 62 | errors="Answer cannot be empty", 63 | ) 64 | 65 | # Save answer to disk 66 | path = self._calling_agent._logdir / "answer.txt" 67 | with open(path, "w") as f: 68 | f.write(answer) 69 | 70 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 71 | 72 | except Exception as e: 73 | return ToolResult( 74 | tool_name=self.TOOL_NAME, 75 | success=False, 76 | errors=f"Failed to save answer: {str(e)}", 77 | ) 78 | 79 | @classmethod 80 | async def args_str_to_dict( 81 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML 82 | ) -> tuple[dict | None, str | None]: 83 | args_dict, parse_warnings = await args_str_to_dict( 84 | args_str, guide_obj=cls, arg_format=arg_format, root_tag="TOOL_ARGS" 85 | ) 86 | if args_dict: 87 | args_dict["answer"] = str(args_dict["answer"]) 88 | return args_dict, parse_warnings 89 | 90 | @classmethod 91 | def generate_examples(cls) -> list[tuple["SubmitAnswer", ToolResult]]: 92 | """Generate example uses of the submit_answer tool.""" 93 | from ..agents.implementations import DemoAgent 94 | 95 | return [ 96 | ( 97 | cls( 98 | calling_agent=DemoAgent(), 99 | answer="5", 100 | ), 101 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 102 | ), 103 | ( 104 | cls( 105 | calling_agent=DemoAgent(), 106 | # reasoning="The speed of the car is 10mph", 107 | answer="10 miles per hour", 108 | ), 109 | ToolResult( 110 | tool_name=cls.TOOL_NAME, success=False, errors="Parser error" 111 | ), 112 | ), 113 | ( 114 | cls( 115 | calling_agent=DemoAgent(), 116 | # reasoning="The calculated value is 1,234.5", 117 | answer="1,234.5", 118 | ), 119 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 120 | ), 121 | ] 122 | -------------------------------------------------------------------------------- /base_agent/src/tools/calculator.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import logging 7 | 8 | from pydantic import Field 9 | 10 | from .base_tool import BaseTool 11 | from ..types.tool_types import ToolResult 12 | from ..types.agent_types import AgentInterface 13 | 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | class Calculator(BaseTool): 19 | TOOL_NAME = "calculate" 20 | TOOL_DESCRIPTION = """A calculator tool that evaluates mathematical expressions. 21 | Supports basic arithmetic operations (including +, -, *, / and ^) and parentheses. 
22 | All expressions must contain only numbers and valid operators."""
23 | 
24 |     reasoning: str = Field(
25 |         ..., description="Concise reasoning about the operation to be performed"
26 |     )
27 |     expression: str = Field(
28 |         ...,
29 |         description="Mathematical expression to evaluate",
30 |         pattern=r"^[\d\s\+\-\*\/\(\)\.]+$",
31 |     )
32 | 
33 |     def __init__(self, calling_agent: AgentInterface, **data):
34 |         super().__init__(calling_agent=calling_agent, **data)
35 | 
36 |     async def run(self) -> ToolResult:
37 |         try:
38 |             result = eval(self.expression)
39 |             return ToolResult(
40 |                 tool_name=self.TOOL_NAME, success=True, output=str(result)
41 |             )
42 |         except Exception as e:
43 |             return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e))
44 | 
45 |     @classmethod
46 |     def generate_examples(cls) -> list[tuple["Calculator", ToolResult]]:
47 |         from ..agents.implementations import DemoAgent
48 | 
49 |         return [
50 |             (
51 |                 cls(
52 |                     calling_agent=DemoAgent(),
53 |                     reasoning="The number of fruit is the sum of the two apples and three oranges",
54 |                     expression="2 + 3",
55 |                 ),
56 |                 ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(5)),
57 |             ),
58 |             (
59 |                 cls(
60 |                     calling_agent=DemoAgent(),
61 |                     reasoning="The compound expression will require parentheses",
62 |                     expression="(3 * 4) / 2",
63 |                 ),
64 |                 ToolResult(tool_name=cls.TOOL_NAME, success=True, output=str(6.0)),
65 |             ),
66 |         ]
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     import asyncio
71 |     from ..agents.implementations import DemoAgent
72 | 
73 |     async def test():
74 |         c = Calculator(calling_agent=DemoAgent(), reasoning="...", expression="2+2")
75 |         result = await c.run()
76 | 
77 |         assert result.tool_name == Calculator.TOOL_NAME
78 |         assert result.success
79 |         assert result.duration < 0.5
80 |         assert result.output == str(4)
81 |         print("All tests pass!")
82 | 
83 |     asyncio.run(test())
-------------------------------------------------------------------------------- /base_agent/src/tools/directory_tools.py: --------------------------------------------------------------------------------
1 | # Self-Improving Coding Agent
2 | # Copyright (c) 2025 Maxime Robeyns
3 | #
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import logging
7 | 
8 | from pathlib import Path
9 | from pydantic import Field
10 | 
11 | from .base_tool import BaseTool
12 | from ..utils.file_views import create_filetree, FileTreeOptions
13 | from ..types.tool_types import ToolResult
14 | from ..types.agent_types import AgentInterface
15 | 
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(logging.INFO)
18 | 
19 | 
20 | class ViewDirectory(BaseTool):
21 |     """Tool to generate a detailed view of directory contents."""
22 | 
23 |     TOOL_NAME = "view_directory"
24 |     TOOL_DESCRIPTION = """View the contents of a directory with configurable depth and detail options.
25 | 26 | The tool provides a formatted tree view of the directory structure, including: 27 | - File and directory sizes 28 | - Permissions 29 | - Modification times 30 | - Smart collapsing of large directories 31 | - Configurable depth and detail level""" 32 | 33 | directory: str = Field( 34 | ..., 35 | description="The directory path to view", 36 | ) 37 | max_depth: int = Field( 38 | default=2, 39 | description="Maximum depth to traverse (None for unlimited)", 40 | ) 41 | show_hidden: bool = Field( 42 | default=False, 43 | description="Whether to show hidden files and directories", 44 | ) 45 | collapse_threshold: int = Field( 46 | default=15, 47 | description="Number of items before a directory is collapsed (None for no collapsing)", 48 | ) 49 | show_timestamps: bool = Field( 50 | default=False, 51 | description="Whether to show file modification timestamps", 52 | ) 53 | exclude_patterns: list[str] = Field( 54 | default=[], 55 | description="List of glob patterns to exclude (e.g. '.git' or '*.pyc')", 56 | ) 57 | show_full_filepaths: bool = Field( 58 | default=False, 59 | description="Whether to show the full filepaths from the root directory", 60 | ) 61 | 62 | def __init__(self, calling_agent: AgentInterface, **data): 63 | super().__init__(calling_agent=calling_agent, **data) 64 | 65 | async def run(self) -> ToolResult: 66 | try: 67 | path = Path(self.directory) 68 | if not path.exists(): 69 | return ToolResult( 70 | tool_name=self.TOOL_NAME, 71 | success=False, 72 | errors=f"Directory does not exist: {path}", 73 | ) 74 | if not path.is_dir(): 75 | return ToolResult( 76 | tool_name=self.TOOL_NAME, 77 | success=False, 78 | errors=f"Path is not a directory: {path}", 79 | ) 80 | 81 | # Create options for the tree generation 82 | options = FileTreeOptions( 83 | collapse_threshold=self.collapse_threshold, 84 | show_hidden=self.show_hidden, 85 | exclude_patterns=( 86 | self.exclude_patterns 87 | if len(self.exclude_patterns) > 0 or self.show_hidden 88 | else None 89 | ), 90 | show_mtime=self.show_timestamps, 91 | min_dir_level=( 92 | 0 if self.max_depth is None else max(0, self.max_depth - 1) 93 | ), 94 | show_full_path=self.show_full_filepaths, 95 | ) 96 | 97 | # Generate the tree 98 | tree_output = create_filetree(path, options) 99 | 100 | return ToolResult( 101 | tool_name=self.TOOL_NAME, 102 | success=True, 103 | output=f"Directory contents of {path}:\n{tree_output}", 104 | ) 105 | 106 | except Exception as e: 107 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 108 | 109 | @classmethod 110 | def generate_examples(cls) -> list[tuple["ViewDirectory", ToolResult]]: 111 | from ..agents.implementations import DemoAgent 112 | 113 | return [ 114 | # Basic directory view 115 | ( 116 | cls( 117 | calling_agent=DemoAgent(), 118 | directory="/home/agent/workdir", 119 | max_depth=2, 120 | show_hidden=False, 121 | show_timestamps=False, 122 | exclude_patterns=[], 123 | collapse_threshold=20, 124 | ), 125 | ToolResult( 126 | tool_name=cls.TOOL_NAME, 127 | success=True, 128 | output="Directory contents of /home/agent/workdir:\n" 129 | "workdir/ [0755] (1.2MB, 25 files, 5 dirs)\n" 130 | " src/ [0755] (800KB, 15 files, 3 dirs)\n" 131 | " main.py [0644] 50KB\n" 132 | " utils.py [0644] 30KB\n" 133 | " tests/ [0755] (400KB, 10 files, 2 dirs) [collapsed]\n", 134 | ), 135 | ), 136 | # Detailed view with timestamps 137 | ( 138 | cls( 139 | calling_agent=DemoAgent(), 140 | directory="/home/agent/project", 141 | max_depth=1, 142 | show_hidden=True, 143 | show_timestamps=True, 144 | 
exclude_patterns=[".git", "*.pyc"], 145 | collapse_threshold=15, 146 | ), 147 | ToolResult( 148 | tool_name=cls.TOOL_NAME, 149 | success=True, 150 | output="Directory contents of /home/agent/project:\n" 151 | "project/ [0755] (2.5MB, 40 files, 8 dirs) 2024-01-14 10:00\n" 152 | " .env [0644] 2KB 2024-01-14 09:55\n" 153 | " README.md [0644] 15KB 2024-01-14 09:50\n" 154 | " src/ [0755] (1.5MB, 25 files, 5 dirs) 2024-01-14 10:00\n" 155 | " tests/ [0755] (1MB, 15 files, 3 dirs) 2024-01-14 09:45\n", 156 | ), 157 | ), 158 | ] 159 | -------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from .overwrite_file import OverwriteFile 7 | 8 | __all__ = ["OverwriteFile"] 9 | -------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/overwrite_file.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import re 7 | import logging 8 | 9 | from pathlib import Path 10 | from pydantic import Field 11 | 12 | from .utils import edit_preflight_check, generate_edit_event_content 13 | from ...schemas.json_parsing import json_str_to_dict 14 | from ..base_tool import BaseTool, extract_between_patterns 15 | from ...events import EventBus 16 | from ...types.tool_types import ToolResult 17 | from ...types.event_types import EventType, FileOperation, FileEvent 18 | from ...types.agent_types import AgentInterface 19 | from ...types.common import ArgFormat 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | class OverwriteFile(BaseTool): 26 | """Tool to overwrite an existing file or create a new one with content.""" 27 | 28 | TOOL_NAME = "overwrite_file" 29 | TOOL_DESCRIPTION = f"""Use this tool when you want to write content verbatim to a file, either overwriting an existing file or creating a new one. 30 | 31 | For existing files: 32 | - You MUST have called the `open_file` tool to view the file before over-writing it 33 | - This is to make sure we're not over-writing anything of value that needs to be kept 34 | - The entire content will be replaced verbatim with the new content provided 35 | 36 | For new files: 37 | - 'Overwriting' a not-yet-existing file will create it 38 | - The file will be automatically opened in the context window after creation 39 | 40 | Very important notes: 41 | - The content you provide to this tool will be that file's new content. You must make sure to include absolutely everything you still need 42 | - Do NOT "fold" any code sections because this will cause errors. Instead, write out everything verbatim. 43 | 44 | - DO NOT, under any circumstances, call this tool for a file edit that exceeds about 500 lines. It will be slow, inefficient, costly and error-prone. For these types of large-file edits, you should seek to use more efficient editing tools. 45 | - You do not need to write out the file ahed of time before invoking this tool. 
46 | """ 47 | 48 | filepath: str = Field( 49 | ..., 50 | description="The full absolute filepath of the file to write. For existing files, must be already open in context window.", 51 | ) 52 | full_unabridged_new_content: str = Field( 53 | ..., 54 | description="The full content to write to the file, which will entirely replace any existing content.", 55 | ) 56 | 57 | def __init__(self, calling_agent: AgentInterface, **data): 58 | super().__init__(calling_agent=calling_agent, **data) 59 | 60 | @classmethod 61 | async def args_str_to_dict( 62 | cls, args_str: str, arg_format: ArgFormat = ArgFormat.XML 63 | ) -> tuple[dict | None, str | None]: 64 | if arg_format == ArgFormat.XML: 65 | # Carefully extract the content, with the assumption that there _will_ 66 | # be conflicting tags. 67 | # First, manually get the content between 68 | filepath_pattern = f"(.*?)" 69 | filepath_match = re.search(filepath_pattern, args_str) 70 | filepath = filepath_match.group(1) if filepath_match else None 71 | if not filepath: 72 | return None, "Could not parse filepath" 73 | 74 | # Find the first opening tag 75 | content = extract_between_patterns( 76 | args_str, "", "" 77 | ) 78 | if not content: 79 | return None, "Could not parse file content" 80 | 81 | return dict(filepath=filepath, full_unabridged_new_content=content), None 82 | else: 83 | return await json_str_to_dict(args_str, guide_obj=cls) 84 | 85 | async def run(self) -> ToolResult: 86 | try: 87 | path = Path(self.filepath) 88 | event_bus = await EventBus.get_instance() 89 | 90 | # Check if file exists 91 | file_exists = path.exists() 92 | 93 | if not file_exists: 94 | # Create directory structure if needed 95 | path.parent.mkdir(parents=True, exist_ok=True) 96 | 97 | # For new files, write content first 98 | try: 99 | path.write_text(self.full_unabridged_new_content) 100 | 101 | event = FileEvent( 102 | type=EventType.FILE_EVENT, 103 | content=self.full_unabridged_new_content, 104 | path=str(path), 105 | operation=FileOperation.OPEN, 106 | ) 107 | 108 | await event_bus.publish(event, self._calling_agent._id) 109 | 110 | return ToolResult( 111 | tool_name=self.TOOL_NAME, 112 | success=True, 113 | output=f"Successfully created new file {path}", 114 | ) 115 | except Exception as e: 116 | return ToolResult( 117 | tool_name=self.TOOL_NAME, 118 | success=False, 119 | errors=f"Failed to create new file {path}: {str(e)}", 120 | ) 121 | else: 122 | # For existing files, verify it's open first 123 | result = await edit_preflight_check( 124 | path, self.TOOL_NAME, self._calling_agent 125 | ) 126 | if result: 127 | return result 128 | 129 | prev_content = path.read_text() 130 | 131 | # Now write new content 132 | try: 133 | path.write_text(self.full_unabridged_new_content) 134 | 135 | diff_content, content_hash = generate_edit_event_content( 136 | prev_content, self.full_unabridged_new_content, str(path) 137 | ) 138 | 139 | event = FileEvent( 140 | type=EventType.FILE_EVENT, 141 | content=diff_content, 142 | path=str(path), 143 | operation=FileOperation.EDIT, 144 | content_hash=content_hash, 145 | mtime=path.stat().st_mtime, 146 | ) 147 | 148 | await event_bus.publish(event, self._calling_agent._id) 149 | 150 | return ToolResult( 151 | tool_name=self.TOOL_NAME, 152 | success=True, 153 | output=f"Successfully overwrote content of {path}", 154 | ) 155 | except Exception as e: 156 | return ToolResult( 157 | tool_name=self.TOOL_NAME, 158 | success=False, 159 | errors=f"Failed to write to file {path}: {str(e)}", 160 | ) 161 | 162 | except Exception as e: 163 | return 
ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 164 | 165 | @classmethod 166 | def generate_examples(cls) -> list[tuple["OverwriteFile", ToolResult]]: 167 | from ...agents.implementations import DemoAgent 168 | 169 | return [ 170 | # Example 1: Create new file 171 | ( 172 | cls( 173 | calling_agent=DemoAgent(), 174 | filepath="/home/agent/workdir/new_file.txt", 175 | full_unabridged_new_content="Content for the new file", 176 | ), 177 | ToolResult( 178 | tool_name=cls.TOOL_NAME, 179 | success=True, 180 | output="Successfully created new file /home/agent/workdir/new_file.txt", 181 | ), 182 | ), 183 | # Example 2: Overwrite existing file 184 | ( 185 | cls( 186 | calling_agent=DemoAgent(), 187 | filepath="/home/agent/workdir/example.txt", 188 | full_unabridged_new_content="New content for existing file", 189 | ), 190 | ToolResult( 191 | tool_name=cls.TOOL_NAME, 192 | success=True, 193 | output="Successfully overwrote content of /home/agent/workdir/example.txt", 194 | ), 195 | ), 196 | ] 197 |
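The first/last extraction strategy in `args_str_to_dict` above is what lets the new file content itself contain the literal argument tags without corrupting the parse: the parser takes the first opening tag and the last closing tag rather than attempting balanced XML parsing. A standalone sketch of the idea (the helper below is a simplified stand-in for the repository's `extract_between_patterns`):

def extract_between(s: str, open_tag: str, close_tag: str) -> str | None:
    # First occurrence of the opening tag, last occurrence of the closing tag.
    start = s.find(open_tag)
    end = s.rfind(close_tag)
    if start == -1 or end == -1 or end <= start + len(open_tag):
        return None
    return s[start + len(open_tag):end]

args = (
    "<filepath>/tmp/notes.md</filepath>"
    "<full_unabridged_new_content>"
    "This document quotes </full_unabridged_new_content> as a literal string."
    "</full_unabridged_new_content>"
)
content = extract_between(
    args, "<full_unabridged_new_content>", "</full_unabridged_new_content>"
)
# The embedded literal closing tag survives intact in the extracted content.
assert content is not None and "literal string" in content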
-------------------------------------------------------------------------------- /base_agent/src/tools/edit_tools/utils.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import hashlib 7 | import difflib 8 | 9 | from pathlib import Path 10 | from datetime import datetime, timedelta 11 | 12 | from ...types.tool_types import ToolResult 13 | from ...types.agent_types import AgentInterface, InheritanceFlags 14 | from ...events.event_bus_utils import is_file_open, get_latest_file_event 15 | 16 | 17 | async def edit_preflight_check( 18 | path: Path, tool_name: str, calling_agent: AgentInterface 19 | ) -> ToolResult | None: 20 | inherits_parent_files = ( 21 | InheritanceFlags.OPEN_FILES in calling_agent.INHERITANCE.flags 22 | ) 23 | 24 | file_open: bool = await is_file_open(str(path), calling_agent._id) 25 | if inherits_parent_files and not file_open: 26 | file_open = await is_file_open(str(path), calling_agent._parent_id) 27 | 28 | # Verify file is open 29 | if not file_open: 30 | return ToolResult( 31 | tool_name=tool_name, 32 | success=False, 33 | errors=f"File {path} must be opened first using the open_files tool", 34 | ) 35 | 36 | eps = timedelta(seconds=0.5) 37 | latest_file_event = await get_latest_file_event( 38 | str(path), calling_agent._id, exclude_close=True 39 | ) 40 | # Assumes agent runs are blocking (i.e. all agent file events will be newer 41 | # than parent file events) 42 | if inherits_parent_files and not latest_file_event: 43 | latest_file_event = await get_latest_file_event( 44 | str(path), 45 | calling_agent._parent_id, 46 | exclude_close=True, 47 | ) 48 | 49 | last_mod = datetime.fromtimestamp(path.stat().st_mtime) 50 | if not latest_file_event or last_mod > latest_file_event.timestamp + eps: 51 | last_viewed = ( 52 | latest_file_event.timestamp.strftime("%Y-%m-%d %H:%M:%S") 53 | if latest_file_event 54 | else "Never" 55 | ) 56 | return ToolResult( 57 | tool_name=tool_name, 58 | success=False, 59 | errors=( 60 | f"File {path} was changed at {last_mod.strftime('%Y-%m-%d %H:%M:%S')}, " 61 | f"which is after you last viewed or edited it at {last_viewed}. " 62 | "Please view it again to get its latest contents before making your edit." 63 | ), 64 | ) 65 | 66 | 67 | def generate_edit_event_content( 68 | old_content: str, new_content: str, path: str 69 | ) -> tuple[str, str]: 70 | """Generate a diff between old and new content for file events. 71 | 72 | Returns: 73 | tuple[str, str]: A tuple containing (content_for_event, content_hash) 74 | where content_for_event contains the diff and content_hash is the hash of new_content 75 | """ 76 | if not old_content and new_content: 77 | # For new files, return the full content 78 | content_hash = hashlib.sha256(new_content.encode()).hexdigest() 79 | return new_content, content_hash 80 | 81 | # Generate unified diff 82 | old_lines = old_content.splitlines() 83 | new_lines = new_content.splitlines() 84 | 85 | diff = list( 86 | difflib.unified_diff( 87 | old_lines, 88 | new_lines, 89 | fromfile=f"a/{path}", 90 | tofile=f"b/{path}", 91 | lineterm="", 92 | ) 93 | ) 94 | 95 | if diff: 96 | diff_content = "\n".join(diff) 97 | else: 98 | diff_content = "No changes" 99 | 100 | content_hash = hashlib.sha256(new_content.encode()).hexdigest() 101 | return diff_content, content_hash 102 |
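For reference, the unified diff that `generate_edit_event_content` publishes on the event bus is standard `git`-style output, since it delegates to `difflib.unified_diff` with the same `a/` and `b/` file labels. A quick standalone illustration:

import difflib

old = "def greet():\n    print('hi')\n"
new = "def greet():\n    print('hello')\n"

diff = "\n".join(
    difflib.unified_diff(
        old.splitlines(), new.splitlines(),
        fromfile="a/greet.py", tofile="b/greet.py", lineterm="",
    )
)
print(diff)
# --- a/greet.py
# +++ b/greet.py
# @@ -1,2 +1,2 @@
#  def greet():
# -    print('hi')
# +    print('hello')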
34 | """ 35 | 36 | file_paths: list[str] = Field( 37 | ..., 38 | description="A list of one or more absolute filepaths to add to open in your context window", 39 | ) 40 | show_line_numbers: bool = Field( 41 | False, 42 | description="When True, displays line numbers in the left margin of the file for easier reference.", 43 | ) 44 | 45 | def __init__(self, calling_agent: AgentInterface, **data): 46 | super().__init__(calling_agent=calling_agent, **data) 47 | 48 | async def run(self) -> ToolResult: 49 | try: 50 | output_strings = [] 51 | warnings = [] 52 | total_lines = 0 53 | for fpath in self.file_paths: 54 | path = Path(fpath) 55 | if not path.exists(): 56 | warnings.append(f"File path: {path} does not exist") 57 | continue 58 | 59 | output_strings.append(f"The file at {path} was opened successfully.") 60 | 61 | file_content = path.read_text() 62 | event = FileEvent( 63 | type=EventType.FILE_EVENT, 64 | content=file_content, 65 | path=str(fpath), 66 | operation=FileOperation.OPEN, 67 | metadata={"show_line_numbers": self.show_line_numbers}, 68 | ) 69 | total_lines += len(file_content.splitlines()) 70 | 71 | event_bus = await EventBus.get_instance() 72 | await event_bus.publish(event, self._calling_agent._id) 73 | 74 | if total_lines > 750: 75 | warnings.append(f"You have added {total_lines} of content to the context, which is quite high. If this file is not immediately relevant to the task at hand, you should make sure to close it (and any other long files) with the close_file tool.") 76 | 77 | return ToolResult( 78 | tool_name=self.TOOL_NAME, 79 | success=True, 80 | output="\n".join(output_strings) if output_strings else None, 81 | warnings="\n".join(warnings) if warnings else None, 82 | ) 83 | except Exception as e: 84 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 85 | 86 | @classmethod 87 | def generate_examples(cls) -> list[tuple["OpenFile", ToolResult]]: 88 | from ..agents.implementations import DemoAgent 89 | 90 | return [ 91 | ( 92 | cls( 93 | calling_agent=DemoAgent(), 94 | file_paths=["/home/agent/workdir/example.txt"], 95 | show_line_numbers=False, 96 | ), 97 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 98 | ), 99 | ( 100 | cls( 101 | calling_agent=DemoAgent(), 102 | file_paths=["/tmp/example.txt", "/home/agent/workdir/new.txt"], 103 | show_line_numbers=True, 104 | ), 105 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 106 | ), 107 | ] 108 | 109 | 110 | class CloseFile(BaseTool): 111 | TOOL_NAME = "close_files" 112 | TOOL_DESCRIPTION = """Close one or more open files to clear up space in the context window. 113 | 114 | Note that you can call this tool with the empty list [] as the file_paths to close all open files. 115 | """ 116 | 117 | file_paths: list[str] = Field( 118 | ..., 119 | description="A list of one or more absolute file paths to close. 
If this is the empty list, then all files will be closed", 120 | ) 121 | 122 | def __init__(self, calling_agent: AgentInterface, **data): 123 | super().__init__(calling_agent=calling_agent, **data) 124 | 125 | async def run(self) -> ToolResult: 126 | try: 127 | event_bus = await EventBus.get_instance() 128 | 129 | if len(self.file_paths) == 0: 130 | open_files = await get_open_file_set(self._calling_agent._id) 131 | for open_event in open_files: 132 | close_event = FileEvent( 133 | type=EventType.FILE_EVENT, 134 | content="", 135 | path=open_event.path, 136 | operation=FileOperation.CLOSE, 137 | ) 138 | await event_bus.publish(close_event, self._calling_agent._id) 139 | return ToolResult( 140 | tool_name=self.TOOL_NAME, 141 | success=True, 142 | ) 143 | 144 | for fpath in self.file_paths: 145 | close_event = FileEvent( 146 | type=EventType.FILE_EVENT, 147 | content="", 148 | path=fpath, 149 | operation=FileOperation.CLOSE, 150 | ) 151 | await event_bus.publish(close_event, self._calling_agent._id) 152 | 153 | return ToolResult( 154 | tool_name=self.TOOL_NAME, 155 | success=True, 156 | ) 157 | except Exception as e: 158 | return ToolResult(tool_name=self.TOOL_NAME, success=False, errors=str(e)) 159 | 160 | @classmethod 161 | def generate_examples(cls) -> list[tuple["CloseFile", ToolResult]]: 162 | from ..agents.implementations import DemoAgent 163 | 164 | return [ 165 | ( 166 | cls( 167 | calling_agent=DemoAgent(), 168 | file_paths=["/home/agent/workdir/example.txt"], 169 | ), 170 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 171 | ), 172 | ( 173 | cls( 174 | calling_agent=DemoAgent(), 175 | file_paths=[], 176 | ), 177 | ToolResult(tool_name=cls.TOOL_NAME, success=True), 178 | ), 179 | ] 180 | -------------------------------------------------------------------------------- /base_agent/src/tools/reasoning_structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/tools/reasoning_structures/__init__.py -------------------------------------------------------------------------------- /base_agent/src/tools/reasoning_structures/coding.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A reasoning structure for coding. 8 | """ 9 | import logging 10 | 11 | from pydantic import PrivateAttr 12 | 13 | from .sequential import Step, ToolBasedReasoningStructure, _make_id 14 | from ...types.tool_types import ToolResult 15 | from ...types.agent_types import AgentInterface 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.INFO) 19 | 20 | 21 | class CodingReasoningStructure(ToolBasedReasoningStructure): 22 | 23 | TOOL_NAME = "coding_reasoning_structure" 24 | TOOL_DESCRIPTION = """Apply this reasoning structure when you detect that you have a non-trivial coding implementation task that requires a methodical approach involving initial exploration, implementation, verification and cleanup to complete well. 25 | 26 | Do not call this tool if merely verifying, testing or if the task at hand is quick and does not require such rigour. 
27 | 28 | This reasoning structure will guide you through good software engineering practices, and ensure that no steps have been missed out. 29 | """ 30 | _steps: list[Step] = PrivateAttr(default_factory=lambda: [ 31 | Step( 32 | identifier=_make_id(), 33 | instruction="Explore the project to a) locate all useful documentation (README.md files, common likely MD documentation files, etc), b) locate all files that may relate to your programming instructions, c) identify module-level and file-level design patterns and conventions.", 34 | done_description="You have viewed each of these files, made sure to close irrelevant or long files, and taken notes or summaries. Note that for greenfield projects, this step may complete trivially.", 35 | failed_description="Files could not be opened for some reason, or the project location is unclear.", 36 | ), 37 | Step( 38 | identifier=_make_id(), 39 | instruction="Carefully implement the solution completely and thoroughly. Make sure you observe any existing stylistic conventions, and effectively re-use existing design patterns or modules to avoid duplicating functionality.", 40 | done_description="A first pass at the code implementation is complete, with tests not yet having been run.", 41 | failed_description="You have got stuck trying to get dependencies set up, getting mocks and fixtures set up, or have otherwise digressed from the core code implementation.", 42 | ), 43 | Step( 44 | identifier=_make_id(), 45 | instruction="Test the implementation end-to-end, favouring test scripts instead of test frameworks. If this is not an option or the project already has a test framework set up, then use that.", 46 | done_description="You have ensured that the code is valid, hasn't introduced any regressions and works as intended", 47 | failed_description="You have got stuck writing TDD loops, getting dependencies set up, getting mocks and fixtures set up", 48 | ), 49 | Step( 50 | identifier=_make_id(), 51 | instruction="Clean up: remove any temporary test scripts, toy implementations or other scaffolding. Check that all documentation and docstrings are up-to-date.", 52 | done_description="All temporary files have been removed, and documentation updated.", 53 | ), 54 | ]) 55 | 56 | def __init__(self, calling_agent: AgentInterface, **data): 57 | super().__init__(calling_agent=calling_agent, **data) 58 | 59 | @classmethod 60 | def generate_examples(cls) -> list[tuple["CodingReasoningStructure", ToolResult]]: 61 | from ...agents.implementations import DemoAgent 62 | 63 | return [ 64 | ( 65 | cls(calling_agent=DemoAgent()), 66 | ToolResult( 67 | tool_name=cls.TOOL_NAME, 68 | success=True, 69 | output="The first step in the coding reasoning structure is: ...", 70 | ), 71 | ), 72 | ] 73 |
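Defining a new domain-specific structure in this style only requires a `TOOL_NAME`, a `TOOL_DESCRIPTION`, and a `_steps` list. A hypothetical sketch for a debugging workflow, following the same pattern (this class does not exist in the repository):

class DebuggingReasoningStructure(ToolBasedReasoningStructure):

    TOOL_NAME = "debugging_reasoning_structure"  # hypothetical example
    TOOL_DESCRIPTION = """Apply this reasoning structure when methodically isolating and fixing a non-trivial bug."""

    _steps: list[Step] = PrivateAttr(default_factory=lambda: [
        Step(
            identifier=_make_id(),
            instruction="Reproduce the failure with a minimal, deterministic test case.",
            done_description="The failure reproduces reliably from a single command.",
            failed_description="The failure is intermittent or cannot be triggered.",
        ),
        Step(
            identifier=_make_id(),
            instruction="Bisect to the root cause: narrow down the failing commit, module or input.",
            done_description="The root cause has been identified and written down.",
        ),
        Step(
            identifier=_make_id(),
            instruction="Fix the root cause and re-run the reproduction to confirm, checking for regressions.",
            done_description="The minimal test case passes and no regressions appear.",
        ),
    ])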
9 | """ 10 | import logging 11 | 12 | from uuid import uuid4 13 | 14 | from ..base_tool import BaseTool 15 | from ...types.tool_types import ToolResult 16 | from ...types.agent_types import AgentInterface 17 | from ...types.llm_types import FCI, ToolCallContent 18 | from ...agents.implementations.coder import CodingAgent 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.INFO) 22 | 23 | 24 | class SubagentBasedReasoningStructure(BaseTool): 25 | 26 | TOOL_NAME = "example_subagent_reasoning_structure" 27 | TOOL_DESCRIPTION = """Reason through a fixed list of points sequentially.""" 28 | 29 | def __init__(self, calling_agent: AgentInterface, **data): 30 | super().__init__(calling_agent=calling_agent, **data) 31 | 32 | async def run(self) -> ToolResult: 33 | parent_agent: AgentInterface = self._calling_agent 34 | 35 | try: 36 | await parent_agent._handle_agent_call(ToolCallContent( 37 | call_id=f"agent_{uuid4().hex[:8]}", 38 | tool_name=CodingAgent.AGENT_NAME, 39 | tool_args=dict( 40 | programming_instructions="Print 'a' in a file called 'a.txt'", 41 | ), 42 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 43 | )) 44 | 45 | await parent_agent._handle_agent_call(ToolCallContent( 46 | call_id=f"agent_{uuid4().hex[:8]}", 47 | tool_name=CodingAgent.AGENT_NAME, 48 | tool_args=dict( 49 | programming_instructions="Print 'b' in a file called 'b.txt'", 50 | ), 51 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 52 | )) 53 | 54 | await parent_agent._handle_agent_call(ToolCallContent( 55 | call_id=f"agent_{uuid4().hex[:8]}", 56 | tool_name=CodingAgent.AGENT_NAME, 57 | tool_args=dict( 58 | programming_instructions="Print 'c' in a file called 'c.txt'", 59 | ), 60 | call_type=FCI.UNCONSTRAINED, # this must always be UNCONSTRAINED when forcing otherwise it causes 400 errors with the providers. 61 | )) 62 | 63 | return ToolResult( 64 | tool_name=self.TOOL_NAME, 65 | success=True, 66 | output="Completed successfully" 67 | ) 68 | 69 | except Exception as e: 70 | return ToolResult( 71 | tool_name=self.TOOL_NAME, 72 | success=False, 73 | errors=f"Error in sequential reasoning: {e}" 74 | ) 75 | 76 | @classmethod 77 | def generate_examples(cls) -> list[tuple["SubagentBasedReasoningStructure", ToolResult]]: 78 | from ...agents.implementations import DemoAgent 79 | 80 | return [ 81 | ( 82 | cls(calling_agent=DemoAgent()), 83 | ToolResult( 84 | tool_name=cls.TOOL_NAME, 85 | success=True, 86 | output="Successfully did the ABC", 87 | ), 88 | ), 89 | ] 90 | -------------------------------------------------------------------------------- /base_agent/src/types/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/types/__init__.py -------------------------------------------------------------------------------- /base_agent/src/types/common.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | from enum import Enum 7 | 8 | 9 | class ArgFormat(str, Enum): 10 | """Tool argument formats""" 11 | 12 | XML = "xml" 13 | JSON = "json" 14 | -------------------------------------------------------------------------------- /base_agent/src/types/event_types.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import hashlib 7 | 8 | from enum import Enum 9 | from pathlib import Path 10 | from datetime import datetime 11 | from dataclasses import field, dataclass 12 | 13 | 14 | class EventType(Enum): 15 | ASSISTANT_MESSAGE = "assistant_message" 16 | ASSISTANT_REASONING = "assistant_reasoning" 17 | TOOL_CALL = "tool_call" 18 | TOOL_RESULT = "tool_result" 19 | AGENT_CALL = "agent_call" 20 | AGENT_RESULT = "agent_result" 21 | CORE_PROMPT_UPDATE = "core_prompt_update" 22 | SYSTEM_PROMPT_UPDATE = "system_prompt_update" 23 | FILE_EVENT = "file_event" 24 | APPLICATION_ERROR = "application_error" 25 | APPLICATION_WARNING = "application_warning" 26 | PROBLEM_STATEMENT = "problem_statement" # initial problem statement 27 | EXTERNAL_MESSAGE = "external_message" # subsequent update messages 28 | OVERSEER_NOTIFICATION = "overseer_notification" 29 | OVERSEER_UPDATE = "overseer_update" # for debugging 30 | BUDGET_INFO = "budget_info" 31 | TIMEOUT = "timeout" 32 | COST_LIMIT = "cost_limit" 33 | 34 | 35 | @dataclass 36 | class Event: 37 | """Base class for all events in the stream""" 38 | 39 | type: EventType 40 | content: str 41 | metadata: dict = field(default_factory=dict) 42 | timestamp: datetime = field(default_factory=datetime.now) 43 | 44 | 45 | class FileOperation(Enum): 46 | OPEN = "open" 47 | CLOSE = "close" 48 | EDIT = "edit" 49 | 50 | 51 | @dataclass 52 | class FileEvent: 53 | """Special event for file operations""" 54 | 55 | type: EventType 56 | content: str # NOTE: this is the formatted content, not just the raw file content (e.g. 
with line numbers, content hash, lsp diagnostics, etc) 57 | operation: FileOperation 58 | path: str 59 | 60 | timestamp: datetime = field(default_factory=datetime.now) 61 | metadata: dict = field(default_factory=dict) # NOTE: unused 62 | 63 | mtime: float = field(default=0.0) 64 | content_hash: str = field(default="") 65 | diff: str | None = None 66 | # lsp_diagnostics: list = field(default_factory=list) 67 | 68 | def __post_init__(self): 69 | """Compute hash on initialization if not provided""" 70 | if not self.content_hash and self.content: 71 | self.content_hash = hashlib.sha256(self.content.encode()).hexdigest() 72 | 73 | if self.mtime == 0.0: 74 | try: 75 | self.mtime = Path(self.path).stat().st_mtime 76 | except Exception: 77 | pass 78 | 79 | def __hash__(self): 80 | return hash((self.type, self.operation, self.path, self.content_hash)) 81 | 82 | def __eq__(self, other): 83 | if not isinstance(other, FileEvent): 84 | return False 85 | return ( 86 | self.type == other.type 87 | and self.operation == other.operation 88 | and self.path == other.path 89 | and self.content_hash == other.content_hash 90 | ) 91 | -------------------------------------------------------------------------------- /base_agent/src/types/tool_types.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import os 7 | 8 | from abc import ABC, abstractmethod 9 | from typing import Any, ClassVar 10 | from pydantic import BaseModel, Field 11 | from ..schemas import dumps 12 | from ..types.common import ArgFormat 13 | 14 | 15 | class ToolResult(BaseModel): 16 | """Represents the result of a tool execution.""" 17 | 18 | tool_name: str 19 | success: bool 20 | duration: float = 0.0 # on tool error paths, duration is often 0 21 | output: dict[str, Any] | str | None = None 22 | warnings: str | None = None 23 | errors: str | None = None 24 | invocation_id: str = Field(default_factory=lambda: os.urandom(4).hex()) 25 | 26 | def __str__(self): 27 | str_output = self.output if isinstance(self.output, str) else None 28 | if isinstance(self.output, dict): 29 | str_output = dumps(self.output, ArgFormat.XML, indent=2) 30 | 31 | tool_response_str = "" 32 | tool_response_str += ( 33 | f"\n{'SUCCESS' if self.success else 'FAILURE'}" 34 | ) 35 | if str_output is not None: 36 | tool_response_str += f"\n{str_output}" 37 | if self.warnings is not None: 38 | tool_response_str += f"\n{self.warnings}" 39 | if self.errors is not None: 40 | tool_response_str += f"\n{self.errors}" 41 | if self.duration is not None: 42 | tool_response_str += f"\n{self.duration:.3f}" 43 | tool_response_str += "\n" 44 | 45 | return tool_response_str 46 | 47 | def to_plain_string(self): 48 | str_output = self.output if isinstance(self.output, str) else None 49 | if isinstance(self.output, dict): 50 | str_output = dumps(self.output, ArgFormat.JSON, indent=2) 51 | 52 | tool_response_str = f"{self.tool_name} response:" 53 | tool_response_str += f"\nSuccess: {self.success}" 54 | if str_output is not None: 55 | tool_response_str += f"\nResult: {str_output}" 56 | if self.warnings is not None: 57 | tool_response_str += f"\nWarnings: {self.warnings}" 58 | if self.errors is not None: 59 | tool_response_str += f"\nErrors: {self.errors}" 60 | if self.duration is not None: 61 | tool_response_str += f"\nDuration: {self.duration:.3f}" 62 |
63 | return tool_response_str 64 | 65 | 66 | class ToolInterface(BaseModel, ABC): 67 | """Abstract interface for all tools""" 68 | 69 | # Class variables 70 | TOOL_NAME: ClassVar[str] 71 | TOOL_DESCRIPTION: ClassVar[str] 72 | EPHEMERAL: ClassVar[bool] = False 73 | 74 | class Config: 75 | extra = "forbid" 76 | 77 | @abstractmethod 78 | async def run(self) -> ToolResult: 79 | """Execute the tool's functionality""" 80 | pass 81 | 82 | @classmethod 83 | @abstractmethod 84 | def generate_examples(cls) -> list[tuple["BaseTool", ToolResult]]: 85 | """Generate example uses of the tool with their expected outputs""" 86 | pass 87 | 88 | @classmethod 89 | @abstractmethod 90 | def to_prompt_format(cls, arg_format: ArgFormat = ArgFormat.XML) -> str: 91 | """Convert the tool definition to XML format for the unconstrained tool use prompt.""" 92 | pass 93 | 94 | @classmethod 95 | @abstractmethod 96 | def to_plain_prompt_format(cls, arg_format: ArgFormat = ArgFormat.JSON) -> str: 97 | """Convert the tool definition to a formatted string for the constrained tool use prompt. 98 | 99 | NOTE: most providers use JSON-like syntax in their prompts, so 100 | generating few-shot examples like this tends to work better. 101 | """ 102 | pass 103 | -------------------------------------------------------------------------------- /base_agent/src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/base_agent/src/utils/__init__.py -------------------------------------------------------------------------------- /base_agent/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | import random 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from ..types.llm_types import TokenUsage 11 | from ..types.agent_types import AgentMetrics 12 | 13 | 14 | def make_random_agent_metrics( 15 | tools_enabled: bool = True, 16 | agents_enabled: bool = True, 17 | min_duration_seconds: int = 1, 18 | max_duration_seconds: int = 300, 19 | base_prompt_tokens: int = 500, 20 | token_variance: float = 0.3, 21 | cache_hit_rate: float = 0.4, 22 | cache_write_rate: float = 0.3, 23 | cost_per_1k_tokens: float = 0.002, 24 | seed: int = 42 # Added seed parameter 25 | ) -> AgentMetrics: 26 | """ 27 | Generate random but plausible agent metrics deterministically. 
28 | 29 | Args: 30 | tools_enabled: Whether tools are enabled for this agent 31 | agents_enabled: Whether sub-agents are enabled for this agent 32 | min_duration_seconds: Minimum execution duration in seconds 33 | max_duration_seconds: Maximum execution duration in seconds 34 | base_prompt_tokens: Base number of prompt tokens to vary around 35 | token_variance: How much to vary token counts (as proportion of base) 36 | cache_hit_rate: Proportion of tokens that should be cached hits 37 | cache_write_rate: Proportion of uncached tokens that should be written to cache 38 | cost_per_1k_tokens: Cost per 1000 tokens in dollars 39 | seed: Random seed for deterministic output 40 | 41 | Returns: 42 | AgentMetrics object with randomized but plausible values 43 | """ 44 | # Set the random seed for reproducibility 45 | random.seed(seed) 46 | 47 | # Use a fixed base time instead of datetime.now() 48 | base_time = datetime(2025, 1, 1, 0, 0, 0) # Fixed starting point 49 | start_time = base_time - timedelta(days=random.randint(0, 7)) 50 | duration = random.uniform(min_duration_seconds, max_duration_seconds) 51 | end_time = start_time + timedelta(seconds=duration) 52 | 53 | # Calculate base token counts with some variance 54 | variance_factor = 1 + random.uniform(-token_variance, token_variance) 55 | total_prompt_tokens = int(base_prompt_tokens * variance_factor) 56 | 57 | # Calculate cached vs uncached split 58 | cached_tokens = int(total_prompt_tokens * cache_hit_rate) 59 | uncached_tokens = total_prompt_tokens - cached_tokens 60 | 61 | # Calculate cache writes 62 | cache_writes = int(uncached_tokens * cache_write_rate) 63 | 64 | # Generate completion tokens (typically 20-80% of prompt tokens) 65 | completion_tokens = int(total_prompt_tokens * random.uniform(0.2, 0.8)) 66 | 67 | # Calculate tool and agent calls if enabled 68 | tool_calls = 0 69 | agent_calls = 0 70 | 71 | if tools_enabled: 72 | # Typically 1-5 tool calls per interaction 73 | tool_calls = random.randint(1, 5) 74 | 75 | if agents_enabled: 76 | # Typically 0-3 agent calls per interaction 77 | agent_calls = random.randint(0, 3) 78 | 79 | # Calculate total cost 80 | total_tokens = total_prompt_tokens + completion_tokens 81 | cost = (total_tokens / 1000) * cost_per_1k_tokens 82 | 83 | return AgentMetrics( 84 | start_time=start_time, 85 | end_time=end_time, 86 | token_usage=TokenUsage( 87 | uncached_prompt_tokens=uncached_tokens - cache_writes, 88 | cache_write_prompt_tokens=cache_writes, 89 | cached_prompt_tokens=cached_tokens, 90 | completion_tokens=completion_tokens, 91 | ), 92 | cost=cost, 93 | tool_calls=tool_calls, 94 | agent_calls=agent_calls, 95 | ) 96 | -------------------------------------------------------------------------------- /base_agent/src/utils/parsing.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Some parsing utilities. 8 | 9 | This module provides utilities for parsing various types of data, 10 | particularly focusing on numerical parsing from strings. 
11 | """ 12 | 13 | import re 14 | 15 | from typing import Optional, Literal 16 | 17 | def extract_before_last(text: str, pattern: str, keep_pattern: bool = False) -> str: 18 | last_pos = text.rfind(pattern) 19 | offset = len(pattern) if keep_pattern else 0 20 | return text[:last_pos + offset] if last_pos != -1 else "" 21 | 22 | def extract_after_last(text: str, pattern: str, keep_pattern: bool = False) -> str: 23 | last_pos = text.rfind(pattern) 24 | offset = 0 if keep_pattern else len(pattern) 25 | return text[last_pos + offset:] if last_pos != -1 else "" 26 | 27 | 28 | def extract_after_first(text: str, pattern: str, keep_pattern: bool = False) -> str: 29 | first_pos = text.find(pattern) 30 | offset = 0 if keep_pattern else len(pattern) 31 | return text[first_pos + offset:] if first_pos != -1 else "" 32 | 33 | 34 | def extract_between_patterns( 35 | s: str, 36 | pattern_a: str, 37 | pattern_b: str, 38 | a_occurrence: Literal["first"] | Literal["last"] = "first", 39 | b_occurrence: Literal["first"] | Literal["last"] = "last", 40 | ) -> str | None: 41 | # Validate both occurrences upfront 42 | if a_occurrence not in ("first", "last"): 43 | raise ValueError("Invalid value for a_occurrence. Use 'first' or 'last'.") 44 | if b_occurrence not in ("first", "last"): 45 | raise ValueError("Invalid value for b_occurrence. Use 'first' or 'last'.") 46 | 47 | # Determine the index for `pattern_a` 48 | if a_occurrence == "first": 49 | start_index = s.find(pattern_a) 50 | else: # "last" 51 | start_index = s.rfind(pattern_a) 52 | 53 | if start_index == -1: 54 | return None 55 | 56 | start_index += len(pattern_a) 57 | 58 | # Determine the index for `pattern_b` 59 | if b_occurrence == "first": 60 | end_index = s.find(pattern_b) 61 | else: # "last" 62 | end_index = s.rfind(pattern_b) 63 | 64 | if end_index == -1 or end_index <= start_index: 65 | return None 66 | 67 | return s[start_index:end_index] 68 | 69 | 70 | def parse_number_from_string( 71 | answer: str, 72 | ) -> tuple[bool, Optional[float], Optional[str]]: 73 | cleaned = answer.strip().replace(",", "") 74 | 75 | # Pattern for a valid number segment 76 | number_pattern = r"-?\d*\.?\d+(?:[eE][-+]?\d+)?" 77 | match = re.search(number_pattern, cleaned) 78 | 79 | if not match: 80 | return ( 81 | False, 82 | None, 83 | "Could not find a number in the answer. Please provide a clear numerical response.", 84 | ) 85 | 86 | matched_str = match.group() 87 | # Check for multiple decimal points in the matched string 88 | if matched_str.count(".") > 1: 89 | return ( 90 | False, 91 | None, 92 | "Found what looks like a number but couldn't parse it: too many decimal points", 93 | ) 94 | 95 | try: 96 | value = float(matched_str) 97 | full_match = matched_str == cleaned 98 | if not full_match: 99 | return ( 100 | True, 101 | value, 102 | "Warning: Found additional text around the number. In future, try to provide just the number.", 103 | ) 104 | return True, value, None 105 | except ValueError as e: 106 | return ( 107 | False, 108 | None, 109 | f"Found what looks like a number but couldn't parse it: {str(e)}", 110 | ) 111 | -------------------------------------------------------------------------------- /base_agent/src/utils/stop_tokens.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
-------------------------------------------------------------------------------- /base_agent/src/utils/stop_tokens.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # WARNING: while you can read this file, editing this file directly 7 | # will stop your generation abruptly and you will fail! 8 | # 9 | # If you want to add a new stop token for the next agent iteration, then you 10 | # should append it to this file using a terminal tool like: 11 | # echo 'NEW_STOP_TOKEN = ""' >> tools/stop_tokens.py 12 | # 13 | # If you want to remove one, then make a line edit using something like: 14 | # sed -i '<line>d' tools/stop_tokens.py. 15 | # Note that the first token, TOOL_STOP_TOKEN, is on line 14 of this file after 16 | # this comment is counted. To delete it, you'd do: 17 | # sed -i '14d' tools/stop_tokens.py. 18 | 19 | TOOL_STOP_TOKEN = "" 20 | AGENT_STOP_TOKEN = "" 21 | OVERSEER_STOP_TOKEN = "" 22 | -------------------------------------------------------------------------------- /base_agent/src/web_server/__init__.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Web server package for callgraph visualization.""" 7 | 8 | from .server import run_server 9 | 10 | __all__ = ["run_server"] 11 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/components/metrics-display.js: -------------------------------------------------------------------------------- 1 | /** 2 | * MetricsDisplay component 3 | */ 4 | 5 | import { Component } from "../core.js"; 6 | import { formatters } from "../utils/formatters.js"; 7 | import { store } from "../store.js"; 8 | 9 | export class MetricsDisplay extends Component { 10 | constructor() { 11 | super(); 12 | this.attachShadow({ mode: "open" }); 13 | 14 | // Add styles 15 | const style = document.createElement("style"); 16 | style.textContent = ` 17 | :host { 18 | display: flex; 19 | flex-wrap: wrap; 20 | align-items: center; 21 | color: white; 22 | } 23 | .metric { 24 | display: flex; 25 | align-items: center; 26 | margin-right: 1.5rem; 27 | } 28 | .label { 29 | font-size: 0.75rem; 30 | text-transform: uppercase; 31 | letter-spacing: 0.05em; 32 | color: #d1d5db; 33 | } 34 | .value { 35 | margin-left: 0.5rem; 36 | font-size: 0.875rem; 37 | font-weight: 500; 38 | } 39 | .cached { 40 | font-size: 0.75rem; 41 | color: #d1d5db; 42 | } 43 | `; 44 | this.shadowRoot.appendChild(style); 45 | 46 | // Create container 47 | this.container = document.createElement("div"); 48 | this.container.style.display = "flex"; 49 | this.container.style.flexWrap = "wrap"; 50 | this.container.style.alignItems = "center"; 51 | this.shadowRoot.appendChild(this.container); 52 | 53 | // Listen for state changes 54 | document.addEventListener("state-change", (e) => { 55 | if (e.detail.property === "callgraphData") { 56 | this.setState({ data: e.detail.value }); 57 | } 58 | }); 59 | } 60 | 61 | render() { 62 | const data = this.state.data || {}; 63 | 64 | this.container.innerHTML = ` 65 | <div class="metric">
66 | <span class="label">Duration</span> 67 | <span class="value">${formatters.duration(data.total_duration)}</span> 68 | </div> 69 | <div class="metric"> 70 | <span class="label">Total Tokens</span> 71 | <span class="value">${formatters.tokens(data.total_tokens)}</span> 72 | <span class="cached">${data.total_tokens ? formatters.cachePercent(data.num_cached_tokens, data.total_tokens) : "-"}</span> 73 | </div> 74 | <div class="metric"> 75 | <span class="label">Cost</span> 76 | <span class="value">${formatters.cost(data.total_cost)}</span> 77 | </div>
78 | `; 79 | } 80 | } 81 | 82 | customElements.define("metrics-display", MetricsDisplay); 83 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/core.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Core reactive system for the visualization 3 | */ 4 | 5 | // Base Component class 6 | export class Component extends HTMLElement { 7 | constructor() { 8 | super(); 9 | this.state = new Proxy( 10 | {}, 11 | { 12 | set: (target, property, value) => { 13 | target[property] = value; 14 | this.render(); 15 | return true; 16 | }, 17 | }, 18 | ); 19 | } 20 | 21 | setState(newState) { 22 | Object.assign(this.state, newState); 23 | } 24 | 25 | render() { 26 | // Override in subclasses 27 | } 28 | 29 | connectedCallback() { 30 | this.render(); 31 | } 32 | } 33 | 34 | // HTML escaping utility 35 | export function escapeHtml(unsafe) { 36 | return unsafe 37 | .replace(/&/g, "&amp;") 38 | .replace(/</g, "&lt;") 39 | .replace(/>/g, "&gt;") 40 | .replace(/"/g, "&quot;") 41 | .replace(/'/g, "&#039;"); 42 | } 43 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/store.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Centralized state management with WebSocket support 3 | */ 4 | 5 | export const store = new Proxy( 6 | { 7 | callgraphData: null, 8 | }, 9 | { 10 | set(target, property, value) { 11 | target[property] = value; 12 | document.dispatchEvent( 13 | new CustomEvent("state-change", { 14 | detail: { property, value }, 15 | }) 16 | ); 17 | return true; 18 | }, 19 | } 20 | ); 21 | 22 | let socket; 23 | 24 | export async function updateVisualization() { 25 | try { 26 | const response = await fetch("/api/callgraph"); 27 | const data = await response.json(); 28 | 29 | // Skip if data hasn't changed 30 | if (JSON.stringify(data) !== JSON.stringify(store.callgraphData)) { 31 | store.callgraphData = data; 32 | } 33 | } catch (error) { 34 | console.error("Error updating visualization:", error); 35 | } 36 | } 37 | 38 | function connectWebSocket() { 39 | socket = new WebSocket(`ws://${window.location.host}/ws`); 40 | 41 | socket.onopen = () => { 42 | console.log("WebSocket connected"); 43 | }; 44 | 45 | socket.onmessage = (event) => { 46 | const message = JSON.parse(event.data); 47 | if (message.type === 'event') { 48 | // Get latest data to incorporate the new event 49 | updateVisualization(); 50 | } 51 | }; 52 | 53 | socket.onclose = () => { 54 | console.log("WebSocket disconnected.
Reconnecting..."); 55 | setTimeout(connectWebSocket, 1000); 56 | }; 57 | 58 | socket.onerror = (error) => { 59 | console.error("WebSocket error:", error); 60 | }; 61 | } 62 | 63 | // Start WebSocket connection and initial data load 64 | export function startUpdates() { 65 | updateVisualization(); // Initial load 66 | connectWebSocket(); // Real-time updates 67 | } -------------------------------------------------------------------------------- /base_agent/src/web_server/static/styles.css: -------------------------------------------------------------------------------- 1 | /* Base styles */ 2 | body { 3 | font-family: "Inter", sans-serif; 4 | color: #1f2937; 5 | } 6 | 7 | /* Global styles */ 8 | .execution-tree, 9 | #event-stream { 10 | font-family: "JetBrains Mono", monospace; 11 | font-size: 13px; 12 | line-height: 1.3; 13 | } 14 | 15 | /* Header styles */ 16 | .header { 17 | background: linear-gradient(90deg, #1e293b 0%, #334155 100%); 18 | } 19 | 20 | /* Tree visualization styles */ 21 | .execution-tree { 22 | position: relative; 23 | } 24 | 25 | .execution-tree .node { 26 | margin-bottom: 0.25rem; 27 | position: relative; 28 | } 29 | 30 | .execution-tree .node-content { 31 | margin-left: 1.25rem; 32 | position: relative; 33 | } 34 | 35 | /* Vertical line for tree structure */ 36 | .execution-tree .node-content::before { 37 | content: ""; 38 | position: absolute; 39 | left: -12px; 40 | top: 0; 41 | bottom: 0; 42 | width: 2px; 43 | background-color: #e2e8f0; 44 | } 45 | 46 | /* Hover effect for collapsible areas */ 47 | .execution-tree .node-content:hover::before { 48 | background-color: #93c5fd; 49 | } 50 | 51 | /* Reduce vertical space */ 52 | .execution-tree .event-entry, 53 | #event-stream .event { 54 | padding-top: 0.125rem; 55 | padding-bottom: 0.125rem; 56 | } 57 | 58 | /* Event line styles */ 59 | .event-line { 60 | position: relative; 61 | } 62 | 63 | /* Event stream styles */ 64 | #event-stream .event { 65 | margin-bottom: 1rem; 66 | border-radius: 0.25rem; 67 | overflow: hidden; 68 | box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1); 69 | } 70 | 71 | #event-stream .event-content, 72 | #event-stream .event-full-content { 73 | background-color: #f8fafc; 74 | transition: background-color 0.2s; 75 | } 76 | 77 | #event-stream .event-content:hover, 78 | #event-stream .event-full-content:hover { 79 | background-color: #f1f5f9; 80 | } 81 | 82 | /* Execution tree hover styles */ 83 | .execution-tree .cursor-pointer { 84 | transition: background-color 0.2s; 85 | } 86 | 87 | .execution-tree .cursor-pointer:hover { 88 | background-color: #eff6ff; 89 | } 90 | 91 | /* Status indicators */ 92 | .status-indicator { 93 | display: inline-block; 94 | width: 10px; 95 | height: 10px; 96 | border-radius: 50%; 97 | margin-right: 0.5rem; 98 | } 99 | 100 | .status-pending { 101 | background-color: #fbbf24; 102 | } 103 | 104 | .status-running { 105 | background-color: #60a5fa; 106 | animation: pulse 2s infinite; 107 | } 108 | 109 | .status-success { 110 | background-color: #34d399; 111 | } 112 | 113 | .status-failed { 114 | background-color: #f87171; 115 | } 116 | 117 | @keyframes pulse { 118 | 0% { 119 | opacity: 1; 120 | } 121 | 50% { 122 | opacity: 0.6; 123 | } 124 | 100% { 125 | opacity: 1; 126 | } 127 | } 128 | 129 | /* Animation */ 130 | @keyframes highlight { 131 | 0% { 132 | background-color: rgba(59, 130, 246, 0.1); 133 | } 134 | 50% { 135 | background-color: rgba(59, 130, 246, 0.1); 136 | } 137 | 100% { 138 | background-color: transparent; 139 | } 140 | } 141 | 142 | .event-highlight { 143 | 
animation: highlight 2s ease-in-out; 144 | } 145 | 146 | /* Scrollbar styles */ 147 | ::-webkit-scrollbar { 148 | width: 8px; 149 | height: 8px; 150 | } 151 | 152 | ::-webkit-scrollbar-track { 153 | background: #f1f5f9; 154 | border-radius: 4px; 155 | } 156 | 157 | ::-webkit-scrollbar-thumb { 158 | background: #cbd5e1; 159 | border-radius: 4px; 160 | } 161 | 162 | ::-webkit-scrollbar-thumb:hover { 163 | background: #94a3b8; 164 | } 165 | 166 | /* Utility classes */ 167 | .truncate { 168 | white-space: nowrap; 169 | overflow: hidden; 170 | text-overflow: ellipsis; 171 | } 172 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/utils/event-utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Event-related utilities 3 | */ 4 | 5 | // Event type mapping for badges 6 | export function getEventBadge(type) { 7 | const badges = { 8 | assistant_message: "assistant", 9 | tool_call: "tool", 10 | tool_result: "tool", 11 | agent_call: "agent", 12 | agent_result: "agent", 13 | overseer_notification: "overseer", 14 | system_prompt_update: "system", 15 | core_prompt_update: "system", 16 | default: "system", 17 | }; 18 | return badges[type] || badges.default; 19 | } 20 | 21 | // Get node status indicator 22 | export function getStatusIndicator(node) { 23 | if (!node.started_at) { 24 | return { class: "status-pending", label: "Pending" }; 25 | } 26 | if (!node.completed_at) { 27 | return { class: "status-running", label: "Running" }; 28 | } 29 | return node.success 30 | ? { class: "status-success", label: "Success" } 31 | : { class: "status-failed", label: "Failed" }; 32 | } 33 | 34 | // Creates a chronological event stream from all events across all nodes 35 | export function createChronologicalEventStream(nodes) { 36 | const allEvents = []; 37 | Object.entries(nodes).forEach(([nodeId, node]) => { 38 | if (node.events) { 39 | allEvents.push( 40 | ...node.events.map((event) => ({ 41 | nodeId, 42 | nodeName: node.name, 43 | event, 44 | time: new Date(event.timestamp), 45 | })), 46 | ); 47 | } 48 | }); 49 | return allEvents.sort((a, b) => a.time - b.time); 50 | } 51 | 52 | // Sort events while maintaining agent call sequence 53 | export function sortNodeEvents(events) { 54 | const sortedEvents = []; 55 | const tempEvents = [...events].sort( 56 | (a, b) => new Date(a.timestamp) - new Date(b.timestamp), 57 | ); 58 | 59 | let i = 0; 60 | while (i < tempEvents.length) { 61 | const event = tempEvents[i]; 62 | sortedEvents.push(event); 63 | i++; 64 | 65 | if (event.type === "agent_call") { 66 | const callTime = new Date(event.timestamp); 67 | const agentEvents = []; 68 | let j = i; 69 | let foundResult = false; 70 | while (j < tempEvents.length && !foundResult) { 71 | const nextEvent = tempEvents[j]; 72 | if ( 73 | nextEvent.type === "agent_result" && 74 | new Date(nextEvent.timestamp) > callTime 75 | ) { 76 | agentEvents.push(nextEvent); 77 | tempEvents.splice(j, 1); 78 | foundResult = true; 79 | continue; 80 | } 81 | tempEvents.splice(j, 1); 82 | agentEvents.push(nextEvent); 83 | } 84 | sortedEvents.push(...agentEvents); 85 | } 86 | } 87 | 88 | return sortedEvents; 89 | } 90 | 91 | // UI interaction functions 92 | export function toggleContent(index) { 93 | // Get event-stream component 94 | const eventStream = document.querySelector("event-stream"); 95 | if (eventStream && eventStream.shadowRoot) { 96 | const truncated = eventStream.shadowRoot.querySelector( 97 | `#event-${index} .event-content`, 
98 | ); 99 | const full = eventStream.shadowRoot.querySelector(`#event-full-${index}`); 100 | if (truncated && full) { 101 | if (truncated.classList.contains("hidden")) { 102 | truncated.classList.remove("hidden"); 103 | full.classList.add("hidden"); 104 | } else { 105 | truncated.classList.add("hidden"); 106 | full.classList.remove("hidden"); 107 | } 108 | } 109 | } 110 | } 111 | 112 | export function scrollToTop() { 113 | window.scrollTo({ top: 0, behavior: "smooth" }); 114 | } 115 | 116 | export function scrollToStreamEvent(index) { 117 | // Get event-stream component 118 | const eventStream = document.querySelector("event-stream"); 119 | if (eventStream && eventStream.shadowRoot) { 120 | const streamEvent = eventStream.shadowRoot.querySelector(`#event-${index}`); 121 | if (streamEvent) { 122 | streamEvent.scrollIntoView({ behavior: "smooth", block: "center" }); 123 | streamEvent.classList.add("event-highlight"); 124 | setTimeout(() => streamEvent.classList.remove("event-highlight"), 2000); 125 | 126 | // Expand the event details if needed 127 | const truncated = streamEvent.querySelector(`.event-content`); 128 | const full = streamEvent.querySelector(`#event-full-${index}`); 129 | if (truncated && full && truncated.classList.contains("hidden")) { 130 | toggleContent(index); 131 | } 132 | } 133 | } 134 | } 135 | 136 | export function toggleNode(nodeId) { 137 | const content = document.querySelector(`#${nodeId}-content`); 138 | if (content) { 139 | content.classList.toggle("hidden"); 140 | } 141 | } 142 | 143 | // Expose required functions to window object for global access 144 | window.scrollToStreamEvent = scrollToStreamEvent; 145 | window.scrollToTop = scrollToTop; 146 | window.toggleContent = toggleContent; 147 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/utils/formatters.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Formatting utilities 3 | */ 4 | 5 | export const formatters = { 6 | duration: (s) => 7 | !s 8 | ? "0s" 9 | : s < 60 10 | ? `${s.toFixed(1)}s` 11 | : `${Math.floor(s / 60)}m ${(s % 60).toFixed(1)}s`, 12 | tokens: (t) => 13 | !t 14 | ? "0" 15 | : t < 1000 16 | ? `${t}` 17 | : t < 1000000 18 | ? `${(t / 1000).toFixed(1)}K` 19 | : `${(t / 1000000).toFixed(1)}M`, 20 | cost: (c) => 21 | !c 22 | ? "$0.00" 23 | : c < 0.01 24 | ? `$${c.toFixed(5)}` 25 | : c < 0.1 26 | ? `$${c.toFixed(4)}` 27 | : c < 1 28 | ? `$${c.toFixed(3)}` 29 | : `$${c.toFixed(2)}`, 30 | cachePercent: (cached, total) => 31 | !total ? 
"0%" : `${((cached / total) * 100).toFixed(1)}% cached`, 32 | }; 33 | 34 | // Get total tokens from usage object 35 | export function getTotalTokens(usage) { 36 | if (!usage) return 0; 37 | return ( 38 | (usage.uncached_prompt_tokens || 0) + 39 | (usage.cache_write_prompt_tokens || 0) + 40 | (usage.cached_prompt_tokens || 0) + 41 | (usage.completion_tokens || 0) 42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /base_agent/src/web_server/static/visualizer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Main visualization entry point 3 | */ 4 | 5 | import { startUpdates } from "./store.js"; 6 | import "./components/execution-tree.js"; 7 | import "./components/event-stream.js"; 8 | import "./components/metrics-display.js"; 9 | import { 10 | toggleContent, 11 | toggleNode, 12 | scrollToTop, 13 | scrollToStreamEvent, 14 | } from "./utils/event-utils.js"; 15 | 16 | // Make UI functions globally available 17 | window.toggleContent = toggleContent; 18 | window.toggleNode = toggleNode; 19 | window.scrollToTop = scrollToTop; 20 | window.scrollToStreamEvent = scrollToStreamEvent; 21 | 22 | // Start updates 23 | startUpdates(); 24 | -------------------------------------------------------------------------------- /base_agent/src/web_server/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Agent Execution 5 | 6 | 10 | 11 | 12 | 13 | 14 |
15 | 16 |
17 |
18 |
21 |

Agent Execution

22 | 23 |
24 |
25 |
26 | 27 | 28 |
29 |
30 | 31 |
32 |
33 |

Execution Tree

34 |
35 | 36 |
37 | 38 | 39 |
40 |
41 |

Event Stream

42 |
43 | 44 |
45 |
46 |
47 |
48 | 49 | 50 | -------------------------------------------------------------------------------- /base_agent/tests/benchmarks/test_gsm8k_benchmark.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Tests for the GSM8K benchmark implementation.""" 7 | import pytest 8 | import tempfile 9 | import os 10 | from pathlib import Path 11 | from unittest.mock import patch, MagicMock 12 | 13 | from src.benchmarks.gsm8k import GSM8KBenchmark, GSM8KExample 14 | 15 | 16 | class TestGSM8KExample: 17 | """Tests for the GSM8KExample class.""" 18 | 19 | def test_from_raw(self): 20 | """Test conversion from raw dataset example.""" 21 | # Create a mock raw example 22 | raw_example = { 23 | "question": "John has 5 apples. He buys 2 more. How many does he have now?", 24 | "answer": "John has 5 apples initially.\nHe buys 2 more apples.\nSo he has 5 + 2 = <<5+2=7>> apples in total.\n#### 7" 25 | } 26 | 27 | example = GSM8KExample.from_raw(raw_example) 28 | 29 | assert example.answer == raw_example["answer"] 30 | assert example.steps == [ 31 | "John has 5 apples initially.", 32 | "He buys 2 more apples.", 33 | "So he has 5 + 2 = <<5+2=7>> apples in total." 34 | ] 35 | assert example.final_answer == 7.0 36 | 37 | def test_extract_calculations(self): 38 | """Test extraction of calculations from solution steps.""" 39 | raw_example = { 40 | "question": "Calculation test", 41 | "answer": "Step 1: Calculate 2 + 3 = <<2+3=5>>\nStep 2: Multiply by 4: 5 × 4 = <<5*4=20>>\n#### 20" 42 | } 43 | 44 | example = GSM8KExample.from_raw(raw_example) 45 | calculations = example.extract_calculations() 46 | 47 | assert len(calculations) == 2 48 | 49 | # First calculation 50 | expr1, expected1, actual1 = calculations[0] 51 | assert expr1 == "2+3" 52 | assert expected1 == 5 53 | assert actual1 == 5 54 | 55 | # Second calculation 56 | expr2, expected2, actual2 = calculations[1] 57 | assert expr2 == "5*4" 58 | assert expected2 == 20 59 | assert actual2 == 20 60 | 61 | 62 | @pytest.mark.parametrize("subset_size", [None, 5, 10]) 63 | def test_benchmark_initialization(subset_size): 64 | """Test initializing the GSM8K benchmark with various subset sizes.""" 65 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 66 | # Mock the dataset loading 67 | mock_dataset = MagicMock() 68 | mock_dataset.__getitem__.return_value = [ 69 | {"question": f"Question {i}", "answer": f"Some steps\n#### {i}"} 70 | for i in range(1, 21) # Create 20 mock examples 71 | ] 72 | mock_load_dataset.return_value = mock_dataset 73 | 74 | benchmark = GSM8KBenchmark(seed=42, subset_size=subset_size) 75 | 76 | # Check benchmark properties 77 | assert benchmark.name == "gsm8k" 78 | 79 | # Verify subset_size is respected 80 | if subset_size: 81 | assert len(benchmark.problems) == subset_size 82 | else: 83 | assert len(benchmark.problems) == 20 # All examples 84 | 85 | # Verify problems have the expected structure 86 | for problem in benchmark.problems: 87 | assert isinstance(problem.statement, str) 88 | assert isinstance(problem.problem_id, str) # Just check it's a string 89 | assert isinstance(problem.answer, float) 90 | assert isinstance(problem.answer_discussion, str) 91 | 92 | 93 | @pytest.mark.asyncio 94 | async def test_score_problem_correct(): 95 | """Test scoring a correct GSM8K answer.""" 96 | with 
patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 97 | # Mock the dataset loading 98 | mock_dataset = MagicMock() 99 | mock_dataset.__getitem__.return_value = [ 100 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 101 | ] 102 | mock_load_dataset.return_value = mock_dataset 103 | 104 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 105 | problem = benchmark.problems[0] 106 | 107 | # Create a temporary directory for the answer 108 | with tempfile.TemporaryDirectory() as tmp_dir: 109 | answer_dir = Path(tmp_dir) 110 | 111 | # Create answer.txt with the correct answer 112 | answer_file = answer_dir / "answer.txt" 113 | answer_file.write_text("42") 114 | 115 | # Score the answer 116 | score, errors, discussion = await benchmark.score_problem( 117 | problem=problem, 118 | agent_workdir="/fake/workdir", 119 | agent_answer_dir=str(answer_dir), 120 | container_name="fake_container" 121 | ) 122 | 123 | # Verify the scoring 124 | assert score == 1.0 125 | assert errors is None 126 | assert discussion is not None 127 | 128 | 129 | @pytest.mark.asyncio 130 | async def test_score_problem_incorrect(): 131 | """Test scoring an incorrect GSM8K answer.""" 132 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 133 | # Mock the dataset loading 134 | mock_dataset = MagicMock() 135 | mock_dataset.__getitem__.return_value = [ 136 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 137 | ] 138 | mock_load_dataset.return_value = mock_dataset 139 | 140 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 141 | problem = benchmark.problems[0] 142 | 143 | # Create a temporary directory for the answer 144 | with tempfile.TemporaryDirectory() as tmp_dir: 145 | answer_dir = Path(tmp_dir) 146 | 147 | # Create answer.txt with an incorrect answer 148 | answer_file = answer_dir / "answer.txt" 149 | answer_file.write_text("43") 150 | 151 | # Score the answer 152 | score, errors, discussion = await benchmark.score_problem( 153 | problem=problem, 154 | agent_workdir="/fake/workdir", 155 | agent_answer_dir=str(answer_dir), 156 | container_name="fake_container" 157 | ) 158 | 159 | # Verify the scoring 160 | assert score == 0.0 161 | assert errors is None 162 | assert discussion is not None 163 | 164 | 165 | @pytest.mark.asyncio 166 | async def test_score_problem_invalid_format(): 167 | """Test scoring a GSM8K answer with invalid format.""" 168 | with patch("src.benchmarks.gsm8k.load_dataset") as mock_load_dataset: 169 | # Mock the dataset loading 170 | mock_dataset = MagicMock() 171 | mock_dataset.__getitem__.return_value = [ 172 | {"question": "Question 1", "answer": "Some steps\n#### 42"} 173 | ] 174 | mock_load_dataset.return_value = mock_dataset 175 | 176 | benchmark = GSM8KBenchmark(seed=42, subset_size=1) 177 | problem = benchmark.problems[0] 178 | 179 | # Create a temporary directory for the answer 180 | with tempfile.TemporaryDirectory() as tmp_dir: 181 | answer_dir = Path(tmp_dir) 182 | 183 | # Create answer.txt with an incorrectly formatted answer 184 | answer_file = answer_dir / "answer.txt" 185 | answer_file.write_text("The answer is forty-two") 186 | 187 | # Score the answer 188 | score, errors, discussion = await benchmark.score_problem( 189 | problem=problem, 190 | agent_workdir="/fake/workdir", 191 | agent_answer_dir=str(answer_dir), 192 | container_name="fake_container" 193 | ) 194 | 195 | # Verify the scoring 196 | assert score == 0.0 197 | assert errors is not None # Should have parsing errors 198 | assert "could not convert string to float" in 
errors.lower() or "invalid literal" in errors.lower() 199 | -------------------------------------------------------------------------------- /base_agent/tests/test_example.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from src.types.llm_types import Model 7 | 8 | def test_example(): 9 | assert True 10 | assert isinstance(Model.SONNET_35.id, str) 11 | -------------------------------------------------------------------------------- /base_agent/tests/tools/reasoning_structures/test_sequential.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """Tests for the Sequential reasoning structure tool.""" 7 | import pytest 8 | from unittest.mock import patch, AsyncMock 9 | 10 | from src.tools.reasoning_structures.sequential import ( 11 | ToolBasedReasoningStructure, Step, InvocationState, _make_id 12 | ) 13 | from src.agents.implementations import DemoAgent 14 | from src.types.tool_types import ToolResult 15 | 16 | 17 | # Do not use global pytestmark 18 | # Apply asyncio marker only to functions that need it 19 | @pytest.mark.asyncio 20 | async def test_initialization(): 21 | """Test proper initialization of the reasoning structure.""" 22 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 23 | 24 | # Verify basic properties 25 | assert structure.TOOL_NAME == "example_reasoning_structure" 26 | assert hasattr(structure, "_steps") 27 | assert len(structure._steps) > 0 28 | assert all(isinstance(step, Step) for step in structure._steps) 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_run_initializes_state(): 33 | """Test that run() correctly initializes state.""" 34 | agent = DemoAgent() 35 | structure = ToolBasedReasoningStructure(calling_agent=agent) 36 | 37 | # Run the reasoning structure 38 | result = await structure.run() 39 | 40 | # Verify state initialization 41 | assert len(agent._local_state) == 1 42 | 43 | invocation_id = next(iter(agent._local_state.keys())) 44 | invocation = agent._local_state[invocation_id] 45 | 46 | assert isinstance(invocation, InvocationState) 47 | assert invocation.steps == structure._steps 48 | assert invocation.current_step_id == structure._steps[0].identifier 49 | assert invocation.current_step_complete_tool is not None 50 | 51 | 52 | @pytest.mark.asyncio 53 | async def test_run_registers_completion_tool(): 54 | """Test that run() registers a completion tool for the first step.""" 55 | # Create an empty mock registry 56 | mock_registry = {} 57 | 58 | # Apply the patch within the test 59 | with patch("src.tools.reasoning_structures.sequential.tool_registry", mock_registry): 60 | agent = DemoAgent() 61 | structure = ToolBasedReasoningStructure(calling_agent=agent) 62 | 63 | # Run the reasoning structure 64 | await structure.run() 65 | 66 | # Verify a tool was registered 67 | assert len(mock_registry) == 1 68 | 69 | # Get the registered tool 70 | tool_name = next(iter(mock_registry.keys())) 71 | 72 | # Verify it's a completion tool 73 | assert tool_name.endswith("_complete") 74 | assert mock_registry[tool_name] in agent._available_tools 75 | 76 | 77 
| @pytest.mark.asyncio 78 | async def test_run_returns_correct_result(): 79 | """Test that run() returns the expected result structure.""" 80 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 81 | 82 | # Run the reasoning structure 83 | result = await structure.run() 84 | 85 | # Verify result properties 86 | assert isinstance(result, ToolResult) 87 | assert result.tool_name == structure.TOOL_NAME 88 | assert result.success is True 89 | assert "step id" in result.output.lower() 90 | assert "step instructions" in result.output.lower() 91 | 92 | 93 | @pytest.mark.asyncio 94 | async def test_step_completion_tool_creation(): 95 | """Test the creation of step completion tools.""" 96 | # Setup a mock for create_step_tool 97 | with patch("src.tools.reasoning_structures.sequential.create_step_tool") as mock_create_step_tool: 98 | # Setup mock return value 99 | mock_tool_cls = AsyncMock() 100 | mock_create_step_tool.return_value = mock_tool_cls 101 | 102 | # Create and run structure 103 | structure = ToolBasedReasoningStructure(calling_agent=DemoAgent()) 104 | await structure.run() 105 | 106 | # Verify tool creation 107 | mock_create_step_tool.assert_called_once() 108 | 109 | # Check arguments 110 | args = mock_create_step_tool.call_args[0] 111 | assert isinstance(args[0], str) # invocation_id 112 | assert isinstance(args[1], Step) # step 113 | 114 | # No asyncio marker for this function since it's synchronous 115 | def test_step_creation_utility(): 116 | """Test the utility function for creating step identifiers.""" 117 | # Generate IDs with custom prefix 118 | ids = [_make_id("test_prefix") for _ in range(5)] 119 | 120 | # Verify uniqueness 121 | assert len(ids) == len(set(ids)) 122 | 123 | # Verify format 124 | for id in ids: 125 | assert id.startswith("test_prefix_") 126 | assert len(id) > len("test_prefix_") 127 | 128 | # Verify default prefix works 129 | default_id = _make_id() 130 | assert default_id.startswith("step_") 131 | -------------------------------------------------------------------------------- /base_agent/tests/tools/test_base_tool.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | """Tests for the BaseTool class functionality.""" 7 | import pytest 8 | from unittest.mock import Mock, patch 9 | import asyncio 10 | from typing import Optional 11 | 12 | # Fix the import paths to work when running from the base_agent directory 13 | from src.tools.base_tool import BaseTool, tool_registry 14 | from src.types.tool_types import ToolResult 15 | from src.types.agent_types import AgentInterface 16 | from src.types.common import ArgFormat 17 | 18 | class TestBaseTool: 19 | """Test suite for BaseTool class.""" 20 | 21 | def setup_method(self): 22 | """Setup for each test method.""" 23 | # Save the original registry and clear it for testing 24 | self.original_registry = dict(tool_registry) 25 | tool_registry.clear() 26 | 27 | def teardown_method(self): 28 | """Teardown after each test method.""" 29 | # Restore the original registry after each test 30 | tool_registry.clear() 31 | tool_registry.update(self.original_registry) 32 | 33 | def test_tool_registration(self): 34 | """Test that tools are properly registered through metaclass.""" 35 | # Define a test tool class 36 | class TestTool(BaseTool): 37 | TOOL_NAME = "test_tool" 38 | TOOL_DESCRIPTION = "A test tool for registration" 39 | 40 | async def run(self) -> ToolResult: 41 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 42 | 43 | @classmethod 44 | def generate_examples(cls): 45 | return [] 46 | 47 | # Verify the tool was registered correctly 48 | assert "test_tool" in tool_registry 49 | assert tool_registry["test_tool"] == TestTool 50 | 51 | @pytest.mark.asyncio 52 | async def test_tool_examples(self): 53 | """Test that generate_examples returns valid examples.""" 54 | # Define a test tool with examples 55 | class ExampleTool(BaseTool): 56 | TOOL_NAME = "example_tool" 57 | TOOL_DESCRIPTION = "Test tool with examples" 58 | 59 | async def run(self) -> ToolResult: 60 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 61 | 62 | @classmethod 63 | def generate_examples(cls): 64 | # Return a minimal valid example 65 | mock_agent = Mock(spec=AgentInterface) 66 | tool_instance = cls(calling_agent=mock_agent) 67 | tool_result = ToolResult(tool_name=cls.TOOL_NAME, success=True) 68 | return [(tool_instance, tool_result)] 69 | 70 | # Check examples format 71 | examples = ExampleTool.generate_examples() 72 | 73 | assert isinstance(examples, list) 74 | assert len(examples) == 1 75 | example = examples[0] 76 | assert isinstance(example, tuple) 77 | assert len(example) == 2 78 | assert isinstance(example[0], ExampleTool) 79 | assert isinstance(example[1], ToolResult) 80 | 81 | @pytest.mark.asyncio 82 | async def test_args_str_to_dict(self): 83 | """Test XML and JSON argument parsing.""" 84 | from pydantic import Field 85 | 86 | class ArgTool(BaseTool): 87 | TOOL_NAME = "arg_tool" 88 | TOOL_DESCRIPTION = "Test tool with arguments" 89 | 90 | arg1: str = Field(..., description="Test argument") 91 | arg2: int = Field(default=0, description="Optional argument") 92 | 93 | async def run(self) -> ToolResult: 94 | return ToolResult(tool_name=self.TOOL_NAME, success=True) 95 | 96 | @classmethod 97 | def generate_examples(cls): 98 | return [] 99 | 100 | # Test XML parsing 101 | xml_args = """ 102 | 103 | test 104 | 42 105 | 106 | """ 107 | args_dict, warnings = await ArgTool.args_str_to_dict(xml_args, ArgFormat.XML) 108 | assert args_dict is not None 109 | assert args_dict["arg1"] == "test" 110 | assert args_dict["arg2"] == 42 111 | assert warnings is None 112 | 113 | # Test bad XML - this should result in a warning and possibly a 
None args_dict 114 | # or a dict with only default values, depending on the implementation 115 | bad_xml = "test" 116 | args_dict, warnings = await ArgTool.args_str_to_dict(bad_xml, ArgFormat.XML) 117 | # The important thing is that a warning is generated 118 | assert warnings is not None 119 | 120 | # We don't make assumptions about whether args_dict is None or partially populated 121 | # as implementation details can vary. If it's None, the test passes. 122 | # If not None, check that it doesn't contain the required field or that it does have defaults. 123 | if args_dict is not None: 124 | # It might contain default values but not the required field 125 | assert "arg1" not in args_dict, "Required field should not be present in malformed XML" 126 | # Optionally check if default values are preserved 127 | # We don't assert this as it's an implementation detail that could change 128 | # assert args_dict.get("arg2") == 0, "Default value should be present" 129 | 130 | @pytest.mark.asyncio 131 | async def test_tool_result_formatting(self): 132 | """Test that tool results are properly formatted.""" 133 | # Create a mock agent for testing 134 | mock_agent = Mock(spec=AgentInterface) 135 | 136 | # Define a simple test tool 137 | class ResultTool(BaseTool): 138 | TOOL_NAME = "result_tool" 139 | TOOL_DESCRIPTION = "Test tool for result formatting" 140 | 141 | async def run(self) -> ToolResult: 142 | return ToolResult( 143 | tool_name=self.TOOL_NAME, 144 | success=True, 145 | output="test output", 146 | warnings="test warning", 147 | errors=None 148 | ) 149 | 150 | @classmethod 151 | def generate_examples(cls): 152 | return [] 153 | 154 | # Test successful tool execution 155 | tool = ResultTool(calling_agent=mock_agent) 156 | result = await tool.run() 157 | 158 | # Check result structure 159 | assert isinstance(result, ToolResult) 160 | assert result.tool_name == "result_tool" 161 | assert result.success is True 162 | assert "test output" in str(result) 163 | assert "test warning" in str(result) 164 | 165 | # Test failure result formatting 166 | failure_result = ToolResult( 167 | tool_name="fail_tool", 168 | success=False, 169 | output=None, 170 | warnings=None, 171 | errors="test error" 172 | ) 173 | 174 | # Check failure result structure 175 | assert isinstance(failure_result, ToolResult) 176 | assert failure_result.tool_name == "fail_tool" 177 | assert failure_result.success is False 178 | assert "test error" in str(failure_result) 179 | assert "SUCCESS" not in str(failure_result) 180 | assert "FAILURE" in str(failure_result) 181 | 182 | if __name__ == "__main__": 183 | # Run the tests directly for debugging 184 | pytest.main(["-xvs", __file__]) 185 | -------------------------------------------------------------------------------- /base_agent/tests/utils/test_parsing.py: -------------------------------------------------------------------------------- 1 | # Self-Improving Coding Agent 2 | # Copyright (c) 2025 Maxime Robeyns 3 | # 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Tests for the parsing utilities module. 
8 | """ 9 | import re 10 | import pytest 11 | from src.utils.parsing import ( 12 | extract_before_last, 13 | extract_after_last, 14 | extract_after_first, 15 | extract_between_patterns, 16 | parse_number_from_string, 17 | ) 18 | 19 | 20 | # Test extract_before_last 21 | @pytest.mark.parametrize( 22 | "text, pattern, keep_pattern, expected", 23 | [ 24 | ("hello world hello", "hello", False, "hello world "), # Basic case 25 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern 26 | ("no pattern here", "xyz", False, ""), # Pattern not found 27 | ("", "hello", False, ""), # Empty string 28 | ("hello", "hello", False, ""), # Pattern at end 29 | ], 30 | ids=["basic", "keep_pattern", "not_found", "empty", "end_pattern"], 31 | ) 32 | def test_extract_before_last(text, pattern, keep_pattern, expected): 33 | result = extract_before_last(text, pattern, keep_pattern) 34 | assert result == expected, f"Expected '{expected}', got '{result}'" 35 | 36 | 37 | # Test extract_after_last 38 | @pytest.mark.parametrize( 39 | "text, pattern, keep_pattern, expected", 40 | [ 41 | ("hello world hello", "hello", False, ""), # Last occurrence at end 42 | ("hello world hello", "hello", True, "hello"), # Keep pattern 43 | ("hello world hello", "world", False, " hello"), # Middle occurrence 44 | ("no pattern here", "xyz", False, ""), # Pattern not found 45 | ("hello", "hello", True, "hello"), # Single pattern 46 | ], 47 | ids=["end", "keep_pattern", "middle", "not_found", "single"], 48 | ) 49 | def test_extract_after_last(text, pattern, keep_pattern, expected): 50 | result = extract_after_last(text, pattern, keep_pattern) 51 | assert result == expected 52 | 53 | 54 | # Test extract_after_first 55 | @pytest.mark.parametrize( 56 | "text, pattern, keep_pattern, expected", 57 | [ 58 | ("hello world hello", "hello", False, " world hello"), # First occurrence 59 | ("hello world hello", "hello", True, "hello world hello"), # Keep pattern 60 | ("no pattern here", "xyz", False, ""), # Pattern not found 61 | ("hello", "he", False, "llo"), # Partial pattern 62 | ("", "xyz", False, ""), # Empty string 63 | ], 64 | ids=["basic", "keep_pattern", "not_found", "partial", "empty"], 65 | ) 66 | def test_extract_after_first(text, pattern, keep_pattern, expected): 67 | result = extract_after_first(text, pattern, keep_pattern) 68 | assert result == expected 69 | 70 | 71 | # Test extract_between_patterns 72 | @pytest.mark.parametrize( 73 | "text, pattern_a, pattern_b, a_occ, b_occ, expected", 74 | [ 75 | # First/Last combinations 76 | ("start middle end", "start", "end", "first", "last", " middle "), 77 | ("a b a c a d", "a", "a", "first", "last", " b a c "), 78 | ("a b a c a d", "a", "a", "last", "first", None), # Invalid range 79 | # Pattern not found 80 | ("hello world", "xyz", "abc", "first", "last", None), 81 | ("hello world", "hello", "xyz", "first", "last", None), 82 | # Edge cases 83 | ("", "a", "b", "first", "last", None), # Empty string 84 | ("abc", "a", "c", "first", "last", "b"), # Adjacent patterns 85 | ], 86 | ids=[ 87 | "first_last", 88 | "multiple_a_last", 89 | "invalid_range", 90 | "a_missing", 91 | "b_missing", 92 | "empty", 93 | "adjacent", 94 | ], 95 | ) 96 | def test_extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ, expected): 97 | result = extract_between_patterns(text, pattern_a, pattern_b, a_occ, b_occ) 98 | assert result == expected 99 | 100 | 101 | # Test extract_between_patterns with invalid occurrence values 102 | @pytest.mark.parametrize( 103 | "a_occ, b_occ", 104 | 
[("invalid", "first"), ("first", "invalid")], 105 | ids=["invalid_a", "invalid_b"], 106 | ) 107 | def test_extract_between_patterns_invalid_occurrence(a_occ, b_occ): 108 | with pytest.raises(ValueError, match="Invalid value for.*occurrence"): 109 | extract_between_patterns("text", "a", "b", a_occ, b_occ) 110 | 111 | 112 | # Fixture for parse_number_from_string tests 113 | @pytest.fixture 114 | def number_parser(): 115 | return parse_number_from_string 116 | 117 | 118 | # Test parse_number_from_string 119 | @pytest.mark.parametrize( 120 | "input_str, expected", 121 | [ 122 | # Successful cases 123 | ("42", (True, 42.0, None)), 124 | ("-3.14", (True, -3.14, None)), 125 | ("1,234.56", (True, 1234.56, None)), # Commas removed 126 | (" 6.022e23 ", (True, 6.022e23, None)), # Scientific notation 127 | # Success with warning 128 | ("42 extra text", (True, 42.0, "Warning: Found additional text.*")), 129 | # Failure cases 130 | ("no number here", (False, None, "Could not find a number.*")), 131 | ("", (False, None, "Could not find a number.*")), 132 | ], 133 | ids=[ 134 | "integer", 135 | "negative_float", 136 | "comma_float", 137 | "scientific", 138 | "extra_text", 139 | "no_number", 140 | "empty", 141 | ], 142 | ) 143 | def test_parse_number_from_string(number_parser, input_str, expected): 144 | success, value, message = number_parser(input_str) 145 | assert success == expected[0] 146 | assert value == expected[1] 147 | if message is not None and expected[2] is not None: 148 | assert re.match(expected[2], message) # Match regex pattern for message 149 | else: 150 | assert message == expected[2] 151 | 152 | 153 | # Example of a slow test (for demonstration) 154 | @pytest.mark.slow 155 | def test_parse_number_from_string_slow(number_parser): 156 | import time 157 | time.sleep(1) # Simulate slow operation 158 | success, value, _ = number_parser("12345") 159 | assert success and value == 12345.0 160 | -------------------------------------------------------------------------------- /benchmark_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/benchmark_data/.gitkeep -------------------------------------------------------------------------------- /figures/agent_execution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_execution.png -------------------------------------------------------------------------------- /figures/agent_loop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/figures/agent_loop.png -------------------------------------------------------------------------------- /results/interactive_output/agent_outputs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MaximeRobeyns/self_improving_coding_agent/ed8275dca4d3c5dbf77229964351fe9b424797dc/results/interactive_output/agent_outputs/.gitkeep -------------------------------------------------------------------------------- /sandbox/Dockerfile: -------------------------------------------------------------------------------- 1 | # Based on Fedora 2 | FROM fedora:42 3 | 4 | # Accept TARGET_ARCH 
build argument 5 | ARG TARGET_ARCH=x86_64 6 | 7 | # Set up the environment variables 8 | ENV SANDBOX_DIR=/home/agent \ 9 | SHELL=/bin/bash \ 10 | TZ=Etc/UTC \ 11 | DEBIAN_FRONTEND=noninteractive \ 12 | PATH=/opt/miniconda3/bin:$PATH 13 | 14 | # Setup agent user with sudo access 15 | RUN useradd -m -d /home/agent -s ${SHELL} agent && \ 16 | echo "agent ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/agent && \ 17 | echo "Defaults env_keep += \"PATH\"" >> /etc/sudoers.d/agent && \ 18 | chmod 0440 /etc/sudoers.d/agent 19 | 20 | # Install common dev tools 21 | RUN dnf -y install dnf-plugins-core && \ 22 | dnf -y remove selinux-policy* && \ 23 | dnf -y update && \ 24 | dnf -y install \ 25 | gcc gcc-c++ make git git-lfs llvm llvm-devel clang clang-devel \ 26 | nodejs python3.12 python3.12-devel cmake openssh-server \ 27 | tmux lsof strace gdb ltrace valgrind inotify-tools jq pv bzip2 unzip \ 28 | p7zip wget curl sudo file tree which gettext-envsubst patch openssl \ 29 | rsync zip nmap-ncat ripgrep perf poppler-utils lapack-devel blas-devel \ 30 | openssl-devel libffi-devel procps-ng sysstat htop \ 31 | libtiff-devel golang awk \ 32 | # System and networking utilities 33 | hostname net-tools iproute iputils bind-utils tcpdump traceroute mtr \ 34 | psmisc lsof netcat telnet whois tar gzip less findutils 35 | 36 | # Install Miniconda in /opt and set permissions 37 | USER root 38 | RUN mkdir -p /opt/miniconda3 && \ 39 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${TARGET_ARCH}.sh -O /opt/miniconda3/miniconda.sh && \ 40 | bash /opt/miniconda3/miniconda.sh -b -u -p /opt/miniconda3 && \ 41 | rm /opt/miniconda3/miniconda.sh && \ 42 | chown -R agent:agent /opt/miniconda3 && \ 43 | chmod -R u+w /opt/miniconda3 44 | 45 | 46 | # Configure conda 47 | RUN /opt/miniconda3/bin/conda init --all && \ 48 | /opt/miniconda3/bin/conda config --append channels conda-forge 49 | 50 | # Create system-wide conda initialization 51 | RUN echo '. /opt/miniconda3/etc/profile.d/conda.sh' >> /etc/bashrc && \ 52 | echo 'source /opt/miniconda3/bin/activate' >> /etc/bashrc && \ 53 | mkdir -p /etc/profile.d && \ 54 | echo '. 
/opt/miniconda3/etc/profile.d/conda.sh' >> /etc/profile.d/conda.sh && \
55 |     echo 'source /opt/miniconda3/bin/activate' >> /etc/profile.d/conda.sh && \
56 |     chmod +x /etc/profile.d/conda.sh
57 | 
58 | # Switch back to root for system configurations
59 | USER root
60 | 
61 | # Setup Python 3.12 as default python
62 | RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 && \
63 |     alternatives --set python3 /usr/bin/python3.12 && \
64 |     alternatives --install /usr/bin/python python /usr/bin/python3 1
65 | 
66 | # Create necessary directories for pnpm
67 | RUN mkdir -p ${SANDBOX_DIR}/.local/share/pnpm && \
68 |     touch ${SANDBOX_DIR}/.bashrc && \
69 |     chown -R agent:agent ${SANDBOX_DIR}/.local && \
70 |     chown agent:agent ${SANDBOX_DIR}/.bashrc && \
71 |     chown agent:agent ${SANDBOX_DIR}
72 | 
73 | COPY configs/gitignore ${SANDBOX_DIR}/.gitignore
74 | RUN chown agent:agent ${SANDBOX_DIR}/.gitignore && \
75 |     chmod +w ${SANDBOX_DIR}/.gitignore
76 | 
77 | # Switch back to agent user for remaining setup
78 | USER agent
79 | WORKDIR ${SANDBOX_DIR}
80 | 
81 | # Set directory permissions
82 | RUN mkdir -p ${SANDBOX_DIR}/.ssh && \
83 |     chmod 700 ${SANDBOX_DIR}/.ssh && \
84 |     touch ${SANDBOX_DIR}/.ssh/authorized_keys && \
85 |     chmod 600 ${SANDBOX_DIR}/.ssh/authorized_keys
86 | 
87 | RUN curl https://raw.githubusercontent.com/github/gitignore/main/Python.gitignore >> ${SANDBOX_DIR}/.gitignore && \
88 |     curl https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore >> ${SANDBOX_DIR}/.gitignore
89 | 
90 | # Install and setup pnpm
91 | ENV PNPM_HOME=${SANDBOX_DIR}/.local/share/pnpm
92 | ENV PATH=$PNPM_HOME:$PATH
93 | ENV NODE_OPTIONS=--max_old_space_size=4096
94 | RUN curl -fsSL https://get.pnpm.io/install.sh | ENV="${SANDBOX_DIR}/.bashrc" SHELL="/bin/bash" bash - && \
95 |     . ${SANDBOX_DIR}/.bashrc && \
96 |     echo "export PNPM_HOME=$PNPM_HOME" >> ${SANDBOX_DIR}/.bashrc && \
97 |     echo "export PATH=$PNPM_HOME:\$PATH" >> ${SANDBOX_DIR}/.bashrc && \
98 |     . ${SANDBOX_DIR}/.bashrc && \
99 |     pnpm install -g typescript ts-node @types/node prettier eslint tsx
100 | 
101 | # Install and configure Rust using rustup
102 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
103 |     . ${SANDBOX_DIR}/.cargo/env && \
104 |     rustup component add rust-src && \
105 |     echo '. ${SANDBOX_DIR}/.cargo/env' >> ${SANDBOX_DIR}/.bashrc
106 | 
107 | # Install LSP Servers for common languages with architecture awareness
108 | RUN . ${SANDBOX_DIR}/.bashrc && \
109 |     # Python - Pyright
110 |     sudo dnf install -y npm && \
111 |     sudo npm install -g pyright && \
112 |     # JavaScript/TypeScript
113 |     pnpm install -g typescript-language-server typescript && \
114 |     # Rust - Install and configure Rust using rustup
115 |     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
116 |     . ${SANDBOX_DIR}/.cargo/env && \
117 |     rustup component add rust-src rust-analyzer && \
118 |     echo '. ${SANDBOX_DIR}/.cargo/env' >> ${SANDBOX_DIR}/.bashrc && \
119 |     # Go - Install gopls
120 |     go install golang.org/x/tools/gopls@latest
121 | 
122 | # Configure environment
123 | ENV HOME=${SANDBOX_DIR}
124 | 
125 | # Copy and install some base requirements
126 | COPY base_requirements.txt /tmp/base_requirements.txt
127 | RUN pip install -r /tmp/base_requirements.txt && \
128 |     sudo rm /tmp/base_requirements.txt
129 | 
130 | # Copy and install agent dependencies (maintaining current approach)
131 | COPY --from=base_agent --chown=agent:agent .
/tmp/base_agent 132 | RUN cd /tmp/base_agent && pip install -r requirements.txt 133 | 134 | WORKDIR ${SANDBOX_DIR} 135 | 136 | # Expose necessary ports (maintaining current approach) 137 | EXPOSE 5000 80 22 443 8080 8000 138 | 139 | ARG ANTHROPIC_API_KEY 140 | ARG OPENAI_API_KEY 141 | ARG FIREWORKS_AI_API_KEY 142 | ARG GEMINI_API_KEY 143 | ARG DEEPSEEK_API_KEY 144 | ARG VERTEX_PROJECT_ID 145 | 146 | ENV ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 147 | ENV OPENAI_API_KEY=${OPENAI_API_KEY} 148 | ENV FIREWORKS_AI_API_KEY=${FIREWORKS_AI_API_KEY} 149 | ENV GEMINI_API_KEY=${GEMINI_API_KEY} 150 | ENV DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY} 151 | ENV VERTEX_PROJECT_ID=${VERTEX_PROJECT_ID} 152 | 153 | COPY GOOGLE_APPLICATION_CREDENTIALS.json /tmp/GOOGLE_APPLICATION_CREDENTIALS.json 154 | ENV GOOGLE_APPLICATION_CREDENTIALS=/tmp/GOOGLE_APPLICATION_CREDENTIALS.json 155 | 156 | # Set the entrypoint (maintaining current approach) 157 | CMD ["/bin/bash", "--login"] 158 | -------------------------------------------------------------------------------- /sandbox/GOOGLE_APPLICATION_CREDENTIALS.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "", 4 | "private_key_id": "", 5 | "private_key": "", 6 | "client_email": "", 7 | "client_id": "", 8 | "auth_uri": "", 9 | "token_uri": "", 10 | "auth_provider_x509_cert_url": "", 11 | "client_x509_cert_url": "", 12 | "universe_domain": "" 13 | } 14 | -------------------------------------------------------------------------------- /sandbox/base_requirements.txt: -------------------------------------------------------------------------------- 1 | # System-wide Python packages for development 2 | # Intentionally _not_ pinning versions so we get recent versions on every build 3 | black 4 | flake8 5 | -------------------------------------------------------------------------------- /sandbox/configs/gitignore: -------------------------------------------------------------------------------- 1 | # IDEs 2 | .idea/ 3 | .vscode/ 4 | *.swp 5 | *.swo 6 | 7 | # Build outputs 8 | target/ 9 | dist/ 10 | build/ 11 | *.o 12 | *.a 13 | *.so 14 | 15 | # Logs & temp 16 | *.log 17 | tmp/ 18 | temp/ 19 | 20 | # Directories to ignore at any depth 21 | **/.maestro 22 | **/.vscode 23 | **/.vscode-server 24 | **/.ssh 25 | -------------------------------------------------------------------------------- /sandbox/configs/sandbox_bashrc: -------------------------------------------------------------------------------- 1 | # sandbox_bashrc 2 | 3 | # Guard against sourcing multiple times 4 | if [ -n "$SANDBOX_BASHRC_SOURCED" ]; then 5 | return 6 | fi 7 | export SANDBOX_BASHRC_SOURCED=1 8 | 9 | # If not running interactively, don't do anything 10 | [[ $- != *i* ]] && return 11 | 12 | # User-specific environment 13 | if ! [[ "$PATH" =~ "$HOME/.local/bin:$HOME/bin:" ]] 14 | then 15 | PATH="$HOME/.local/bin:$HOME/bin:$PATH" 16 | fi 17 | export PATH 18 | 19 | # User specific aliases and functions 20 | if [ -d ~/.bashrc.d ]; then 21 | for rc in ~/.bashrc.d/*; do 22 | if [ -f "$rc" ]; then 23 | . "$rc" 24 | fi 25 | done 26 | fi 27 | 28 | unset rc 29 | -------------------------------------------------------------------------------- /scripts/install_swebench_harness.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://github.com/swe-bench/SWE-bench 3 | cd SWE-bench 4 | pip install -e . 
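5 | 
6 | # Optional sanity check (an assumption, not part of the original setup): confirm
7 | # that the editable install is importable from the current Python environment.
8 | python3 -c "import swebench"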
9 | --------------------------------------------------------------------------------