├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── config.py
├── eval_improver.py
├── helpers.py
├── language_model.py
├── run_improver.py
└── tasks
    ├── maxcut
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py
    ├── meta_optimization
        ├── secret_seed_algorithm.py
        ├── secret_seed_algorithm_improved.py
        ├── secret_utility.py
        ├── seed_algorithm.py
        ├── transfer_eval.py
        └── utility.py
    ├── modified_quadratic_assignment
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py
    ├── parity_noise
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py
    ├── parity_noiseless
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py
    ├── sandbox
        ├── seed_algorithm.py
        └── utility.py
    ├── str_grid_dist
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py
    └── three_sat
        ├── secret_utility.py
        ├── seed_algorithm.py
        └── utility.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | *tmp*
156 | 
157 | self-improve/runs/pickles/*
158 | results/*
159 | usage_log.json
160 | usage_log.jsonl
161 | temp/*
162 | cache/*
163 | api_key.py
164 | creativity/*
165 | # PyCharm
166 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
169 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | #.idea/
171 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation
 2 | 
 3 | This is the repo for the paper: [Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation](https://arxiv.org/abs/2310.02304)
 4 | 
 5 | ```
 6 | @article{zelikman2023self,
 7 |   title={Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation},
 8 |   author={Eric Zelikman, Eliana Lorch, Lester Mackey, Adam Tauman Kalai},
 9 |   journal={arXiv preprint arXiv:2310.02304},
10 |   year={2023}
11 | }
12 | ```
13 | 
14 | Abstract: Several recent advances in AI systems (e.g., Tree-of-Thoughts and Program-Aided Language Models) solve problems by providing a "scaffolding" program that structures multiple calls to language models to generate better outputs. A scaffolding program is written in a programming language such as Python. In this work, we use a language-model-infused scaffolding program to improve itself. We start with a seed "improver" that improves an input program according to a given utility function by querying a language model several times and returning the best solution. We then run this seed improver to improve itself. Across a small set of downstream tasks, the resulting improved improver generates programs with significantly better performance than its seed improver. Afterward, we analyze the variety of self-improvement strategies proposed by the language model, including beam search, genetic algorithms, and simulated annealing. Since the language models themselves are not altered, this is not full recursive self-improvement. Nonetheless, it demonstrates that a modern language model, GPT-4 in our proof-of-concept experiments, is capable of writing code that can call itself to improve itself. We critically consider concerns around the development of self-improving technologies and evaluate the frequency with which the generated code bypasses a sandbox.
15 | 
16 | 
17 | # Legal Notices
18 | 
19 | Microsoft and any contributors grant you a license to any code in the repository under the [MIT License](https://opensource.org/licenses/MIT), see the
20 | [LICENSE](LICENSE) file, and grant you a license to the Microsoft documentation and other data
21 | in this repository under the [Creative Commons Attribution 4.0 International Public License](https://creativecommons.org/licenses/by/4.0/legalcode),
22 | see the [DATA_LICENSE](data/DATA_LICENSE) file. 
23 | 
24 | Microsoft, Windows, Microsoft Azure and/or other Microsoft products and services referenced in the documentation
25 | may be either trademarks or registered trademarks of Microsoft in the United States and/or other countries.
26 | The licenses for this project do not grant you rights to use any Microsoft names, logos, or trademarks.
27 | Microsoft's general trademark guidelines can be found at http://go.microsoft.com/fwlink/?LinkID=254653.
28 | 
29 | Privacy information can be found at https://privacy.microsoft.com/en-us/
30 | 
31 | Microsoft and any contributors reserve all other rights, whether under their respective copyrights, patents,
32 | or trademarks, whether by implication, estoppel or otherwise.
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | n = 6
 2 | m = 4
 3 | 
 4 | config = {
 5 |     "use_seed_algorithm": True,
 6 |     "iterative": True,
 7 |     "task": "meta_optimization",
 8 |     "subtask": "parity_noise",
 9 |     "use_improver": True,
10 |     "n_iterations": 6,
11 |     'use_language_model_cache': False,
12 |     'max_responses_per_call': n,
13 |     'language_model_call_budget': m,
14 |     'meta_utility_budget': n * m + 1,
15 |     'utility_budget': n * m + 1,
16 |     'meta_utility_tests': 5,
17 |     'transfer_eval_type': 'improved',
18 |     'use_timeout_in_improver': False,
19 |     'join_pools': False,
20 | }


--------------------------------------------------------------------------------
/eval_improver.py:
--------------------------------------------------------------------------------
1 | from tasks.meta_optimization.transfer_eval import *


--------------------------------------------------------------------------------
/helpers.py:
--------------------------------------------------------------------------------
  1 | from language_model import LanguageModel
  2 | import os
  3 | import time
  4 | import traceback
  5 | import importlib
  6 | import platform
  7 | import faulthandler
  8 | 
  9 | def extract_code(algorithm_str):
 10 |     if isinstance(algorithm_str, str):
 11 |         # If the result is wrapped in triple backticks, remove the first and last lines
 12 |         return find_largest_code_block_line_by_line(algorithm_str)
 13 |     elif isinstance(algorithm_str, list):
 14 |         extracted_codes = [extract_code(algorithm_str) for algorithm_str in algorithm_str]
 15 |         return extracted_codes
 16 | 
 17 | def find_largest_code_block_line_by_line(text):
 18 |     largest_block = ""
 19 |     current_block = ""
 20 |     nesting_level = 0  # To keep track of the level of nesting
 21 | 
 22 |     lines = text.split("\n")
 23 |     
 24 |     for line in lines:
 25 |         if line.startswith("```"):  # We've found a block delimiter
 26 |             if not line[3:].strip():  # If it's a closing delimiter
 27 |                 nesting_level -= 1  # Decrease the nesting level
 28 |                 
 29 |                 if nesting_level == 0:  # We've closed the outermost block
 30 |                     current_block += line + "\n"  # Add the line to the current block
 31 |                     
 32 |                     # Compare the length of the current block with the largest block found so far
 33 |                     if len(current_block) > len(largest_block):
 34 |                         largest_block = current_block
 35 |                     
 36 |                     current_block = ""  # Reset the current block
 37 |                 else:
 38 |                     current_block += line + "\n"  # Add the line to the current block
 39 |             else:  # It's an opening delimiter
 40 |                 current_block += line + "\n"  # Add the line to the current block
 41 |                 nesting_level += 1  # Increase the nesting level
 42 |         else:
 43 |             if nesting_level > 0:  # If we're inside a block
 44 |                 current_block += line + "\n"  # Add the line to the current block
 45 | 
 46 |     if largest_block:
 47 |         # Remove the first and last lines (the outermost backticks)
 48 |         largest_block = "\n".join(largest_block.strip().split("\n")[1:-1])
 49 | 
 50 |     return largest_block if largest_block else None
 51 | 
 52 | def run_by_creating_file(define_fn_str, base_name):
 53 |     # Create a file with the define_fn_str
 54 |     timestamp = str(time.time()).replace(".", "")
 55 |     # Make the temp directory if it doesn't exist
 56 |     os.makedirs("temp", exist_ok=True)
 57 |     write_str_to_file(define_fn_str, f"temp/{base_name}_{timestamp}.py")
 58 |     # Import the file
 59 |     imported = False
 60 |     attempts = 0
 61 |     while not imported and attempts < 10:
 62 |         try:
 63 |             importlib.invalidate_caches()
 64 |             module = importlib.import_module(f"temp.{base_name}_{timestamp}")
 65 |             imported = True
 66 |         except ModuleNotFoundError as e:
 67 |             print("Module not found, trying again")
 68 |             print("Exception:", e)
 69 |             time.sleep(0.1)
 70 |             attempts += 1
 71 |         except Exception as e:
 72 |             print("Failed to import module with exception", e)
 73 |             print(traceback.format_exc())
 74 |             raise e
 75 |     # Get the function
 76 |     fn = getattr(module, base_name)
 77 |     return fn
 78 | 
 79 | 
 80 | def reliability_guard(maximum_memory_bytes = None):
 81 |     """
 82 |     Based on humaneval sandbox - slightly less restrictive:
 83 |     https://github.com/openai/human-eval/blob/master/human_eval/execution.py
 84 | 
 85 |     This disables various destructive functions and prevents the generated code
 86 |     from interfering with the test (e.g. fork bomb, killing other processes,
 87 |     removing filesystem files, etc.)
 88 | 
 89 |     WARNING
 90 |     This function is NOT a security sandbox. Untrusted code, including, model-
 91 |     generated code, should not be blindly executed outside of one. See the 
 92 |     Codex paper for more information about OpenAI's code sandbox, and proceed
 93 |     with caution.
 94 |     """
 95 | 
 96 |     if maximum_memory_bytes is not None:
 97 |         import resource
 98 |         resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
 99 |         resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
100 |         if not platform.uname().system == 'Darwin':
101 |             resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
102 | 
103 |     faulthandler.disable()
104 | 
105 |     import builtins
106 |     builtins.exit = None
107 |     builtins.quit = None
108 | 
109 |     os.kill = None
110 |     os.system = None
111 |     # os.putenv = None
112 |     os.remove = None
113 |     os.removedirs = None
114 |     os.rmdir = None
115 |     os.fchdir = None
116 |     os.setuid = None
117 |     os.fork = None
118 |     os.forkpty = None
119 |     os.killpg = None
120 |     os.rename = None
121 |     os.renames = None
122 |     os.truncate = None
123 |     os.replace = None
124 |     os.unlink = None
125 |     os.fchmod = None
126 |     os.fchown = None
127 |     os.chmod = None
128 |     os.chown = None
129 |     os.chroot = None
130 |     os.fchdir = None
131 |     os.lchflags = None
132 |     os.lchmod = None
133 |     os.lchown = None
134 |     os.getcwd = None
135 |     os.chdir = None
136 | 
137 |     import shutil
138 |     shutil.rmtree = None
139 |     shutil.move = None
140 |     shutil.chown = None
141 | 
142 |     # import subprocess
143 |     # subprocess.Popen = None  # type: ignore
144 | 
145 |     __builtins__['help'] = None
146 | 
147 |     import sys
148 |     sys.modules['ipdb'] = None
149 |     # sys.modules['joblib'] = None
150 |     sys.modules['resource'] = None
151 |     sys.modules['psutil'] = None
152 |     sys.modules['tkinter'] = None
153 | 
154 | def temp_override(define_fn_str, base_name, update_globals=True, use_sandbox=True, strict_sandbox=True):
155 |     """
156 |     Overrides a function temporarily.
157 |     """
158 |     if define_fn_str is None:
159 |         raise Exception("define_fn_str is None in temp_override")
160 |     if use_sandbox:
161 |         filtered_strings = ['ProcessPool']
162 |         for filtered_str in filtered_strings:
163 |             if filtered_str in define_fn_str:
164 |                 raise Exception(f"{filtered_str} is not supported in temp_override")
165 |         if strict_sandbox:
166 |             if not os.path.exists("acknowledge_strict_sandbox.txt"):
167 |                 print("WARNING: Although this script mode is less likely to crash your computer, it is still not a true sandbox and may cause your computer to crash. You should not run this script anywhere where you would not allow a stranger to run arbitrary code.")
168 |                 acknowledge_strict_sandbox = input("Confirm that you acknowledge this (y/n): ")
169 |                 if acknowledge_strict_sandbox != "y":
170 |                     raise Exception("Aborting due to sandbox warning")
171 |                 else:
172 |                     write_str_to_file("", "acknowledge_strict_sandbox.txt")
173 |             reliability_guard()
174 |         else:
175 |             if not os.path.exists("acknowledge_unsafe.txt"):
176 |                 print("WARNING: You are using temp_override without a strict sandbox. This is particularly unsafe and may cause your computer to crash.")
177 |                 acknowledge_unsafe = input("Confirm that you acknowledge this (y/n): ")
178 |                 if acknowledge_unsafe != "y":
179 |                     raise Exception("Aborting due to sandbox warning")
180 |                 else:
181 |                     write_str_to_file("", "acknowledge_unsafe.txt")
182 |                 
183 | 
184 |     new_globals = globals().copy()
185 |     if base_name in new_globals:
186 |         del new_globals[base_name]
187 |     new_fn = run_by_creating_file(define_fn_str, base_name)  # Useful for debugging
188 |     if base_name in new_globals:
189 |         del new_globals[base_name]
190 |     if update_globals:
191 |         globals().update(new_globals)
192 |     return new_fn
193 | 
194 | def read_file_as_str(path):
195 |     with open(path, "r") as f:
196 |         return f.read()
197 | 
198 | def write_str_to_file(s, path, mode="w"):
199 |     if isinstance(s, list):
200 |         s = "\n\n".join(s)
201 |     try:
202 |         with open(path, mode) as f:
203 |             f.write(s)
204 |     except Exception as e:
205 |         print("Failed to write to file", path, "with exception", e)
206 |         print("Traceback:", traceback.format_exc())
207 |         s = str(s)
208 |         with open(path, mode) as f:
209 |             f.write(s)
210 | 
211 | def generate_seed_algorithm(utility_str, t=0.7):
212 |     """
213 |     Implements an algorithm according to a utility function.
214 |     """
215 |     role = "You are an expert programmer, especially skilled at implementing algorithms."
216 |     message = f"""You must write a script that will implement a Python algorithm to solve a problem as well as possible.
217 | 
218 | You will be evaluated based on the following utility function:
219 | ```python
220 | {utility_str}
221 | ```
222 | """
223 |     language_model = LanguageModel(budget=1)
224 |     algorithm_str = language_model.prompt(role, message, n_responses=1, temperature=t)[0]
225 |     algorithm_str = extract_code(algorithm_str)
226 |     return algorithm_str
227 | 
228 | def generate_run_id(iterative, use_seed_algorithm, use_improver, SUBTASK):
229 |     """
230 |     Generates a unique run ID for a run.
231 |     """
232 |     run_id = str(int(time.time()))
233 |     if iterative:
234 |         run_id += "_iterative"
235 |     if use_seed_algorithm:
236 |         run_id += "_seed"
237 |     if use_improver:
238 |         run_id += "_improver"
239 |     run_id += f"_{SUBTASK}"
240 |     return run_id
241 | 
242 | def load_seed_algorithm(task, utility_str, use_existing=False):
243 |     """
244 |     Gets the seed algorithm for a task.
245 |     """
246 |     seed_algorithm_path = f"tasks/{task}/seed_algorithm.py"
247 |     if use_existing and not os.path.exists(seed_algorithm_path):
248 |         seed_algorithm_str = generate_seed_algorithm(utility_str)
249 |         write_str_to_file(seed_algorithm_str, seed_algorithm_path)
250 |     if use_existing:
251 |         return read_file_as_str(seed_algorithm_path)
252 |     return seed_algorithm_str
253 | 
254 | def get_utility_strs(task):
255 |     """
256 |     Gets the utility function for a task.
257 |     """
258 |     return read_file_as_str(f"tasks/{task}/utility.py"), read_file_as_str(f"tasks/{task}/secret_utility.py")
259 | 
260 | def end_pool_if_used(pool, join_pools=False):
261 |     """
262 |     Ends a pool if it is used.
263 |     """
264 |     if pool is not None:
265 |         pool.stop()
266 |         if join_pools:
267 |             pool.join()
268 | 
269 | def write_log(expected_utility_val, expected_utility_test, run_id):
270 |     """
271 |     Writes the expected utility to a log file.
272 |     """
273 |     print("Saving to file")
274 |     with open(f"results/{run_id}/meta_utility_log.txt", "a") as f:
275 |         f.write(f"{expected_utility_val},{expected_utility_test}\n")
276 | 


--------------------------------------------------------------------------------
/language_model.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import openai
  3 | import time
  4 | import random
  5 | import os
  6 | import traceback
  7 | import jsonlines
  8 | import hashlib
  9 | from config import config    
 10 | try:
 11 |     from api_key import openai_key
 12 | except:
 13 |     raise Exception("Create an api_key.py file with a dict including your OpenAI API key.")
 14 | from concurrent.futures import ThreadPoolExecutor as ThreadPool
 15 | from collections import Counter
 16 | 
 17 | MAX_BATCH = 5  # most number to query at once
 18 | MAX_TOKENS = 1024
 19 | 
 20 | def set_openai_key():
 21 |     """Sets OpenAI key."""
 22 |     if 'api_type' in openai_key and openai_key['api_type'] == 'azure':
 23 |         openai.api_type = "azure"
 24 |         openai.api_version = "2023-03-15-preview"
 25 |         openai.api_base = openai_key['api_base']
 26 |     else:
 27 |         if 'organization' in openai_key:
 28 |             openai.organization = openai_key['organization']
 29 |     openai.api_key = openai_key['api_key']
 30 | 
 31 | if openai.api_type == 'azure':
 32 |     # These will depend on how you deployed the OpenAI resource on Azure
 33 |     CHAT_GPT_35 = "gpt-35-turbo"
 34 |     CHAT_GPT_4 = "gpt-4"
 35 | else:
 36 |     CHAT_GPT_35 = "gpt-3.5-turbo-0301"
 37 |     CHAT_GPT_4 = "gpt-4-0314"
 38 | default_engine = CHAT_GPT_4
 39 | 
 40 | cache_counter = defaultdict(int)
 41 | set_openai_key()
 42 | 
 43 | def write_to_usage_log(role, message, n, t):
 44 |     # Write to usage log - using jsonlines for easy appending
 45 |     try:
 46 |         with jsonlines.open("usage_log.jsonl", "a") as writer:
 47 |             writer.write([
 48 |                 time.time(),
 49 |                 {
 50 |                     "role": role,
 51 |                     "message": message,
 52 |                     "n": n,
 53 |                     "t": t,
 54 |                 }
 55 |             ])
 56 |     except Exception as e:
 57 |         print("Failed to write to usage log with exception", e)
 58 |         print("Traceback:", traceback.format_exc())
 59 | 
 60 | class LanguageModel:
 61 |     def __init__(self, budget):
 62 |         """
 63 |         Initializes the language model.
 64 | 
 65 |         Args:
 66 |             role (str): The role of the language model.
 67 |         """
 68 |         self.next_request_time = time.time()
 69 |         self.rate_limit = 0.5  # Initial requests per second - will automatically lower if not feasible
 70 |         self.min_rate_limit = 15 / 60  # Can't be lower than this
 71 |         self.global_timeout = 1024
 72 |         if not os.path.exists("cache"):
 73 |             os.mkdir("cache")
 74 |         self.cache_counter = cache_counter
 75 |         self.use_cache = config['use_language_model_cache']
 76 |         self.times_used = 0
 77 |         self.budget = budget
 78 |         self.max_responses_per_call = config['max_responses_per_call']
 79 | 
 80 |     def prompt(self, expertise, message, n_responses=1, temperature=0.7):
 81 |         """
 82 |         Generates a response to a message.
 83 | 
 84 |         Args:
 85 |             expertise (str): The expertise of the language model.
 86 |             message (str): The message to respond to.
 87 |             n (int): The number of responses to generate.
 88 |             t (float): The temperature of the language model.
 89 | 
 90 |         Returns:
 91 |             responses (list of str): The responses generated by the language model.
 92 |         """
 93 |         # Make sure the message is a string
 94 |         role = expertise
 95 |         assert isinstance(message, str)
 96 |         assert isinstance(role, str)
 97 |         assert n_responses <= self.max_responses_per_call
 98 |         self.times_used += 1
 99 |         if self.times_used > self.budget:
100 |             raise Exception("Error: You have exceeded your call budget.")
101 |         n, t = n_responses, temperature
102 |         cache_key = str((message, n, t))
103 |         cache_file = f"{hashlib.sha256(cache_key.encode()).hexdigest()}.jsonl"
104 |         write_to_usage_log(role, message, n, t)
105 |         if self.use_cache:
106 |             if cache_file in os.listdir("cache"):
107 |                 self.cache_counter[cache_file] += 1
108 |                 # Open the file as a jsonl file
109 |                 try:
110 |                     with jsonlines.open(f"cache/{cache_file}") as reader:
111 |                         messages = [message for message in reader]
112 |                 except Exception as e:
113 |                     print("Failed to read from cache with exception", e)
114 |                     print("Traceback:", traceback.format_exc())
115 |                     messages = []
116 |                     # Remove the cache file
117 |                     try:
118 |                         os.remove(f"cache/{cache_file}")
119 |                     except Exception as e:
120 |                         print("Failed to remove cache file with exception", e)
121 |                         print("Traceback:", traceback.format_exc())
122 |                 if len(messages) > self.cache_counter[cache_file]:
123 |                     return messages[self.cache_counter[cache_file] - 1]
124 |         max_n = MAX_BATCH
125 |         max_tokens = MAX_TOKENS
126 |         next_request_time = self.next_request_time
127 |         rate_limit = self.rate_limit
128 |         global_timeout = self.global_timeout
129 |         min_rate_limit = self.min_rate_limit
130 |         engine = default_engine
131 |         system = role
132 |         user = [message]
133 |         n_max_messages = 8
134 |         temperature = t
135 | 
136 |         # Prompt the model
137 |         remaining = n
138 |         remaining_to_submit = n
139 |         error_count = 0
140 |         while time.time() < next_request_time:
141 |             time.sleep(random.random() * 0.01)
142 | 
143 |         # Tell everyone to wait for one rate limit iteration per call
144 |         next_request_time = time.time() + (n / max_n) / rate_limit
145 |         messages=[{"role": "system", "content": system}]
146 |         for u_id, u in enumerate(user):
147 |             role = "user" if u_id % 2 == 0 else "assistant"
148 |             messages.append({"role": role, "content": u})
149 |         if len(messages) > n_max_messages:
150 |             if n_max_messages == 1:
151 |                 raise Exception("Error: This will exclude the system prompt.")
152 |             else:
153 |                 messages = messages[:n_max_messages // 2] + messages[-n_max_messages // 2:]
154 |         # Select a backend at random
155 |         result = []
156 |         print(f"Querying OpenAI API with messages... {remaining} left")
157 | 
158 |         def query_api(cur_n):
159 |             set_openai_key()
160 |             if openai.api_type == "azure":
161 |                 result.extend(openai.ChatCompletion.create(
162 |                     engine=engine, messages=messages, n=cur_n, temperature=temperature,
163 |                     max_tokens=max_tokens, timeout=global_timeout * cur_n
164 |                 ).choices)
165 |             else:
166 |                 result.extend(openai.ChatCompletion.create(
167 |                     model=engine, messages=messages, n=cur_n, temperature=temperature,
168 |                     max_tokens=max_tokens, timeout=global_timeout * cur_n
169 |                 ).choices)
170 | 
171 |         threadpool = ThreadPool(max_workers=16)
172 |         threadpool_futures = []
173 |         # Submit the requests in parallel
174 |         while remaining_to_submit > 0:
175 |             cur_n = min(max_n, remaining_to_submit)
176 |             threadpool_futures.append((cur_n, threadpool.submit(query_api, cur_n)))
177 |             remaining_to_submit -= cur_n
178 |             time.sleep(rate_limit)
179 | 
180 |         # Wait for the requests to finish. If they fail, retry them.
181 |         while remaining > 0:
182 |             if error_count > 10:
183 |                 break
184 |             new_threadpool_futures = []
185 |             for future_n, future in threadpool_futures:
186 |                 try:
187 |                     future.result()
188 |                     remaining -= future_n
189 |                     rate_limit *= 1.01
190 |                     print("Success! Queried OpenAI API with", future_n, "messages.")
191 |                 except Exception as e:
192 |                     if "https://aka.ms/oai/quotaincrease" not in str(e) and "Rate limit reached for" not in str(e):
193 |                         print("Error while querying OpenAI API. Retrying...", e)
194 |                         error_count += 1
195 |                     else:
196 |                         rate_limit = max(rate_limit * 0.9, min_rate_limit)
197 |                     new_threadpool_futures.append((future_n, threadpool.submit(query_api, future_n,)))
198 |             threadpool_futures = new_threadpool_futures
199 | 
200 |         if remaining == 0:
201 |             if "improve_algorithm" in message:
202 |                 write_time = time.time()
203 |                 save_folder = f"creativity/{int(write_time)}"
204 |                 os.makedirs(save_folder, exist_ok=True)
205 |                 with open(f"{save_folder}/message.txt", "w") as writer:
206 |                     writer.write(message)
207 |                 for res_idx, res in enumerate(result):
208 |                     with open(f"{save_folder}/response_{res_idx}.txt", "w") as writer:
209 |                         writer.write(res.message.content)
210 |         # Convert the response to a list of strings (or one string if n=1)
211 |         results = [choice.message.content for choice in result]
212 |         if cache_file not in os.listdir("cache"):
213 |             with jsonlines.open(f"cache/{cache_file}", "w") as writer:
214 |                 writer.write(results)
215 |         else:
216 |             try:
217 |                 with jsonlines.open(f"cache/{cache_file}", "a") as writer:
218 |                     writer.write(results)
219 |             except Exception as e:
220 |                 print("Failed to write to existing cache with exception", e)
221 |                 print("Traceback:", traceback.format_exc())
222 |         if self.use_cache:
223 |             self.cache_counter[cache_file] += 1
224 |         return results
225 | 
226 |     def batch_prompt(self, expertise, message_batch, temperature):
227 |         """
228 |         Generates a response to a message.
229 | 
230 |         Args:
231 |             expertise (str): The expertise of the language model.
232 |             message (str): The message to respond to.
233 |             n (int): The number of responses to generate.
234 |             t (float): The temperature of the language model.
235 | 
236 |         Returns:
237 |             responses (list of str): The responses generated by the language model.
238 |         """
239 |         # Make sure the message is a string
240 |         role = expertise
241 |         try:
242 |             message_batch = list(message_batch)
243 |         except:
244 |             return []
245 |         assert len(message_batch) <= self.max_responses_per_call
246 |         assert isinstance(message_batch[0], str)
247 |         assert isinstance(role, str)
248 |         self.times_used += 1
249 |         if self.times_used > self.budget:
250 |             raise Exception("Error: You have exceeded your call budget.")
251 |         n, t = 1, temperature
252 |         cache_key = str((str(message_batch), n, t))
253 |         cache_file = f"{hashlib.sha256(cache_key.encode()).hexdigest()}.jsonl"
254 |         write_to_usage_log(role, str(message_batch), n, t)
255 |         if self.use_cache:
256 |             if cache_file in os.listdir("cache"):
257 |                 self.cache_counter[cache_file] += 1
258 |                 # Open the file as a jsonl file
259 |                 try:
260 |                     with jsonlines.open(f"cache/{cache_file}") as reader:
261 |                         messages = [message for message in reader]
262 |                 except Exception as e:
263 |                     print("Failed to read from cache with exception", e)
264 |                     print("Traceback:", traceback.format_exc())
265 |                     messages = []
266 |                     # Remove the cache file
267 |                     try:
268 |                         os.remove(f"cache/{cache_file}")
269 |                     except Exception as e:
270 |                         print("Failed to remove cache file with exception", e)
271 |                         print("Traceback:", traceback.format_exc())
272 |                 if len(message_batch) > self.cache_counter[cache_file]:
273 |                     return messages[self.cache_counter[cache_file] - 1]
274 |         max_n = MAX_BATCH
275 |         max_tokens = MAX_TOKENS
276 |         next_request_time = self.next_request_time
277 |         rate_limit = self.rate_limit
278 |         global_timeout = self.global_timeout
279 |         min_rate_limit = self.min_rate_limit
280 |         engine = default_engine
281 |         system = role
282 |         n_max_messages = 8
283 |         temperature = t
284 | 
285 |         # Prompt the model
286 |         error_count = 0
287 |         while time.time() < next_request_time:
288 |             time.sleep(random.random() * 0.01)
289 | 
290 |         # Select a backend at random
291 |         result = []
292 |         print(f"Querying OpenAI API with messages... {len(message_batch)} left")
293 | 
294 |         def query_api(user, n_responses=1):
295 |             # Tell everyone to wait for one rate limit iteration per call
296 |             next_request_time = time.time() + (n / max_n) / rate_limit
297 |             messages=[{"role": "system", "content": system}]
298 |             for u_id, u in enumerate(user):
299 |                 role = "user" if u_id % 2 == 0 else "assistant"
300 |                 messages.append({"role": role, "content": u})
301 |             if len(messages) > n_max_messages:
302 |                 if n_max_messages == 1:
303 |                     raise Exception("Error: This will exclude the system prompt.")
304 |                 else:
305 |                     messages = messages[:n_max_messages // 2] + messages[-n_max_messages // 2:]
306 | 
307 |             set_openai_key()
308 |             if openai.api_type == "azure":
309 |                 result.extend(openai.ChatCompletion.create(
310 |                     engine=engine, messages=messages, n=n_responses, temperature=temperature,
311 |                     max_tokens=max_tokens, timeout=global_timeout
312 |                 ).choices)
313 |             else:
314 |                 result.extend(openai.ChatCompletion.create(
315 |                     model=engine, messages=messages, n=n_responses, temperature=temperature,
316 |                     max_tokens=max_tokens, timeout=global_timeout
317 |                 ).choices)
318 | 
319 |         threadpool = ThreadPool(max_workers=16)
320 |         threadpool_futures = []
321 |         # Submit the requests in parallel - group them together to save on time
322 |         message_batch_counts = Counter(message_batch)
323 |         for message, count in message_batch_counts.items():
324 |             threadpool_futures.append((message, threadpool.submit(query_api, [message], count)))
325 |             time.sleep(rate_limit)
326 | 
327 |         # Wait for the requests to finish. If they fail, retry them.
328 |         while len(threadpool_futures) > 0:
329 |             if error_count > 10:
330 |                 break
331 |             new_threadpool_futures = []
332 |             for future_message, future in threadpool_futures:
333 |                 try:
334 |                     future.result()
335 |                     rate_limit *= 1.01
336 |                     print("Success! Queried OpenAI API")
337 |                 except Exception as e:
338 |                     if "https://aka.ms/oai/quotaincrease" not in str(e) and "Rate limit reached for" not in str(e):
339 |                         print("Error while querying OpenAI API. Retrying...", e)
340 |                         error_count += 1
341 |                     else:
342 |                         rate_limit = max(rate_limit * 0.9, min_rate_limit)
343 |                     new_threadpool_futures.append((future_message, threadpool.submit(query_api, [future_message],)))
344 |             threadpool_futures = new_threadpool_futures
345 | 
346 |         write_time = time.time()
347 |         save_folder = f"creativity/{int(write_time)}"
348 |         for res_idx, (message, cur_result) in enumerate(zip(message_batch, result)):
349 |             if "improve_algorithm" in message:
350 |                 os.makedirs(save_folder, exist_ok=True)
351 |                 with open(f"{save_folder}/message.txt", "w") as writer:
352 |                     writer.write(message)
353 |                 with open(f"{save_folder}/response_{res_idx}.txt", "w") as writer:
354 |                     writer.write(cur_result.message.content)
355 |         # Convert the response to a list of strings (or one string if n=1)
356 |         results = [choice.message.content for choice in result]
357 |         if cache_file not in os.listdir("cache"):
358 |             with jsonlines.open(f"cache/{cache_file}", "w") as writer:
359 |                 writer.write(results)
360 |         else:
361 |             try:
362 |                 with jsonlines.open(f"cache/{cache_file}", "a") as writer:
363 |                     writer.write(results)
364 |             except Exception as e:
365 |                 print("Failed to write to existing cache with exception", e)
366 |                 print("Traceback:", traceback.format_exc())
367 |         if self.use_cache:
368 |             self.cache_counter[cache_file] += 1
369 |         return results
370 | 
371 | def test_lm():
372 |     global MAX_TOKENS, default_engine
373 |     MAX_TOKENS = 20
374 |     config['use_language_model_cache'] = True
375 |     default_engine = CHAT_GPT_35
376 |     lm = LanguageModel(100)
377 |     print(lm.prompt("You respond in three word answers.", "What's a great day?", 6, 0.2))
378 |     print(lm.batch_prompt("You respond in three word answers.", [f"what is {i} + {i}?" for i in range(6)], 0.7))
379 | 
380 | if __name__ == "__main__":
381 |     test_lm()


--------------------------------------------------------------------------------
/run_improver.py:
--------------------------------------------------------------------------------
  1 | from tasks.meta_optimization.secret_seed_algorithm import improve_algorithm as initial_improve_algorithm
  2 | from helpers import (
  3 |     load_seed_algorithm, write_str_to_file,
  4 |     generate_run_id, get_utility_strs,
  5 |     temp_override, read_file_as_str, end_pool_if_used
  6 | )
  7 | from config import config
  8 | from pebble import ProcessPool as Pool
  9 | import multiprocess
 10 | from language_model import cache_counter, LanguageModel
 11 | import time
 12 | import os
 13 | import traceback
 14 | import argparse
 15 | 
 16 | use_seed_algorithm = config["use_seed_algorithm"]
 17 | task = config["task"]
 18 | 
 19 | utility_str, secret_utility_str = get_utility_strs(task)
 20 | from tasks.meta_optimization.secret_utility import meta_utility
 21 | # Wipe usage_log.jsonl
 22 | with open("usage_log.jsonl", "w") as f:
 23 |     f.write("")
 24 | 
 25 | def pre_utility_hook(cur_utility_fn):
 26 |     """
 27 |     A hook function that resets the usage count of the current utility function,
 28 |     initializes a new language model, and clears the cache counter.
 29 | 
 30 |     Args:
 31 |         cur_utility_fn: The current utility function.
 32 | 
 33 |     Returns:
 34 |         A new language model.
 35 |     """
 36 |     cur_utility_fn.uses = 0
 37 |     language_model = LanguageModel(budget=config['language_model_call_budget'])
 38 |     cache_counter.clear()
 39 |     return language_model
 40 | 
 41 | def try_load_seed_algorithm(improver_filename, cur_utility_fn):
 42 |     """
 43 |     Attempts to load the seed algorithm from a file.
 44 | 
 45 |     Args:
 46 |         improver_filename: The name of the file containing the seed algorithm.
 47 |         cur_utility_fn: The current utility function.
 48 | 
 49 |     Returns:
 50 |         The loaded seed algorithm if successful, None otherwise.
 51 |     """
 52 |     try:
 53 |         improver_str = read_file_as_str(f"results/{resume_from}/{improver_filename}")
 54 |         base_meta_algorithm_str = improver_str
 55 |         improve_algorithm = temp_override(improver_str, "improve_algorithm")
 56 |         cur_utility_fn(base_meta_algorithm_str, handle_exceptions=False)  # Make sure it runs
 57 |         return improver_str, base_meta_algorithm_str, improve_algorithm
 58 |     except Exception as e:
 59 |         print("Failed to load from", improver_filename, "with exception", e)
 60 |         print(traceback.format_exc())
 61 |         return None, None, None
 62 | 
 63 | def get_from_seed(seed_filenames, evaluated_initial_utility, cur_utility_fn):
 64 |     """
 65 |     Attempts to load the seed algorithm from a list of filenames.
 66 | 
 67 |     Args:
 68 |         seed_filenames: The list of filenames containing the seed algorithms.
 69 |         evaluated_initial_utility: A boolean indicating whether the initial utility has been evaluated.
 70 |         cur_utility_fn: The current utility function.
 71 | 
 72 |     Returns:
 73 |         A tuple containing the loaded seed algorithm, the base meta algorithm, the improve algorithm, 
 74 |         and a boolean indicating whether the initial utility has been evaluated.
 75 |     """
 76 |     loaded_successfully = False
 77 |     while len(seed_filenames) > 0:
 78 |         improver_filename = seed_filenames.pop()
 79 |         improver_str, base_meta_algorithm_str, improve_algorithm = try_load_seed_algorithm(improver_filename, cur_utility_fn)
 80 |         if improver_str is not None:
 81 |             loaded_successfully = True
 82 |             break
 83 |     if not loaded_successfully:
 84 |         improver_str = load_seed_algorithm("meta_optimization", utility_str, use_seed_algorithm)
 85 |         base_meta_algorithm_str = load_seed_algorithm(task, utility_str, use_seed_algorithm)
 86 |         improve_algorithm = initial_improve_algorithm
 87 |         cur_utility_fn(base_meta_algorithm_str, log_usage=not evaluated_initial_utility)
 88 |         evaluated_initial_utility = True
 89 |     return improver_str, base_meta_algorithm_str, improve_algorithm, evaluated_initial_utility
 90 | 
 91 | def initialize_pool(use_timeout):
 92 |     """
 93 |     Initializes the process pool, if necessary.
 94 |     """
 95 |     if not use_timeout:
 96 |         return None
 97 |     return Pool(context=multiprocess.get_context('fork'))
 98 | 
 99 | def initialize_seed_list(resume_from):
100 |     """
101 |     Initializes the list of seed algorithm filenames that can be used for the optimization.
102 | 
103 |     Args:
104 |         resume_from: The name of the run to resume from.
105 | 
106 |     Returns:
107 |         A tuple containing the list of seed algorithms and the starting iteration number.
108 |     """
109 |     if resume_from is None:
110 |         return [], 0
111 |     
112 |     seed_algorithms = [
113 |         f for f in os.listdir(f"results/{resume_from}")
114 |         if f.startswith("improved_algorithm") and f.count("_") == 2
115 |     ]
116 |     seed_algorithm_indices = [int(f.split("_")[-1].split(".")[0]) for f in seed_algorithms]
117 |     start_iter = max(seed_algorithm_indices) + 1
118 |     seed_zip = list(sorted(zip(seed_algorithm_indices, seed_algorithms)))
119 |     seed_list = [f for _, f in seed_zip]
120 |     return seed_list, start_iter
121 | 
122 | def attempt_algorithm_improvement(algorithm_to_improve, cur_utility_fn, improve_algorithm, previous_improve_algorithm):
123 |     """
124 |     Attempts to improve the given algorithm using the specified improve algorithm.
125 | 
126 |     Returns:
127 |         A tuple containing a boolean indicating whether the improvement was successful, the new algorithm string, and the improve algorithm used.
128 |     """
129 |     pool = initialize_pool(config["use_timeout_in_improver"])
130 |     language_model = pre_utility_hook(cur_utility_fn)
131 |     successful_improvement = False
132 |     new_algorithm_str = None
133 |     
134 |     try:
135 |         if config["use_timeout_in_improver"]:
136 |             new_algorithm_future = pool.schedule(improve_algorithm, (algorithm_to_improve, cur_utility_fn, language_model))
137 |             new_algorithm_str = new_algorithm_future.result(timeout=2 * 60 * 60)
138 |         else:
139 |             new_algorithm_str = improve_algorithm(algorithm_to_improve, cur_utility_fn, language_model)
140 |         language_model = pre_utility_hook(cur_utility_fn)
141 |         checked_utility = cur_utility_fn(new_algorithm_str, log_usage=True)
142 |         if checked_utility == 0:
143 |             raise Exception("Checked utility is 0")
144 |         successful_improvement = True  
145 |     except Exception as e:
146 |         print("Exception in improving, reverting to previous algorithm:", e, "\n", traceback.format_exc())
147 |         improve_algorithm = previous_improve_algorithm
148 | 
149 |     end_pool_if_used(pool, join_pools=config["join_pools"])
150 |     return successful_improvement, new_algorithm_str, improve_algorithm
151 | 
152 | def run_improver_main(resume_from=None):
153 |     """
154 |     The main function for the improver. Iteratively improves the target algorithm using the improve algorithm.
155 |     If self target is enabled, the algorithm being improved is the same as the improve algorithm.
156 |     """
157 |     is_new_run = resume_from is None
158 |     evaluated_initial_utility = not is_new_run
159 |     cur_utility_fn = meta_utility
160 |     seed_list, start_iter = initialize_seed_list(resume_from)
161 |     improver_str, algorithm_to_improve, improve_algorithm, evaluated_initial_utility = get_from_seed(
162 |         seed_list, evaluated_initial_utility, cur_utility_fn)
163 |     _, _, previous_improve_algorithm, evaluated_initial_utility = get_from_seed(
164 |         seed_list, evaluated_initial_utility, cur_utility_fn)
165 | 
166 |     for cur_iter in range(start_iter, config["n_iterations"]):
167 |         write_str_to_file(algorithm_to_improve, f"results/{run_id}/seed_algorithm_{cur_iter}.py")
168 |         successful_improvement, new_algorithm_str, improve_algorithm = attempt_algorithm_improvement(
169 |             algorithm_to_improve, cur_utility_fn, improve_algorithm, previous_improve_algorithm
170 |         )
171 | 
172 |         if successful_improvement:
173 |             # Save algorithm to file
174 |             write_str_to_file(new_algorithm_str, f"results/{run_id}/improved_algorithm_{cur_iter}.py")
175 |             if config["iterative"]:
176 |                 improver_str = new_algorithm_str
177 |             algorithm_to_improve = new_algorithm_str
178 |             previous_improve_algorithm = improve_algorithm
179 |             improve_algorithm = temp_override(improver_str, "improve_algorithm")
180 | 
181 | if __name__ == "__main__":
182 |     parser = argparse.ArgumentParser()
183 |     parser.add_argument("--resume_from", type=str, default=None)
184 |     parser.add_argument("--resume_from_last", action="store_true")
185 |     args = parser.parse_args()
186 | 
187 |     resume_from = args.resume_from
188 |     if args.resume_from_last:
189 |         # Loop over the ints at the start of the run IDs
190 |         run_ids = [f for f in os.listdir("results")]
191 |         run_ids = [f for f in run_ids if f[0].isdigit()]
192 |         run_ids = sorted(run_ids, key=lambda x: int(x.split("_")[0]))
193 |         run_id = run_ids[-1]
194 |         resume_from = run_id
195 |     if resume_from is not None:
196 |         run_id = resume_from
197 |     else:
198 |         run_id = generate_run_id(
199 |             config["iterative"], use_seed_algorithm, config["use_improver"], config["subtask"]
200 |         )
201 |         # Create a folder for the results
202 |         os.makedirs(f"results/{run_id}", exist_ok=True)
203 |     run_improver_main()


--------------------------------------------------------------------------------
/tasks/maxcut/secret_utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | from helpers import temp_override, read_file_as_str
 4 | from pebble import ThreadPool
 5 | from config import config
 6 | 
 7 | def utility(algorithm_str: str, mode: str = "val"):
 8 |     """
 9 |     Implements the Max-Cut utility function. Returns the average cut weight.
10 |     If the algorithm requires more than 100 milliseconds to run per test, it is a failure.
11 |     """
12 |     # Uncomment to limit the number of times the algorithm can be used
13 |     # uses = getattr(utility, "uses", 0)
14 |     # if uses >= utility.budget:
15 |     #     return 0
16 |     # if not algorithm_str:
17 |     #     print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
18 |     #     return 0
19 |     # utility.uses = uses + 1
20 | 
21 |     n_tests = 100
22 |     min_n_nodes = 50
23 |     max_n_nodes = 200
24 |     average_cut_weight = 0
25 |     base_seed = 4321 if mode == "val" else 5678
26 |     pool = ThreadPool()
27 |     eps = 1e-2
28 | 
29 |     try:
30 |         algorithm = temp_override(algorithm_str, "algorithm")
31 |     except Exception as e:
32 |         return eps
33 | 
34 |     for test_idx in range(n_tests):
35 |         random.seed(base_seed + test_idx)  # Consistent seeding for evaluation
36 |         np.random.seed(base_seed + test_idx)
37 | 
38 |         n_nodes = random.randint(min_n_nodes, max_n_nodes)
39 |         p_edge = 0.4
40 |         max_weight = 10
41 | 
42 |         # Generate random adjacency matrix
43 |         adjacency_matrix = np.zeros((n_nodes, n_nodes))
44 |         for i in range(n_nodes):
45 |             for j in range(i+1, n_nodes):
46 |                 if random.random() < p_edge:
47 |                     weight = random.randint(1, max_weight)
48 |                     adjacency_matrix[i, j] = weight
49 |                     adjacency_matrix[j, i] = weight
50 | 
51 |         # Run the algorithm to find the partition
52 |         try:
53 |             partition_future = pool.schedule(algorithm, (adjacency_matrix,))
54 |             partition = partition_future.result(timeout=0.1)
55 |             if len(partition) != n_nodes:
56 |                 return 0
57 |             cut_weight = 0
58 |             for i in range(n_nodes):
59 |                 for j in range(i+1, n_nodes):
60 |                     if partition[i] != partition[j]:
61 |                         cut_weight += adjacency_matrix[i, j]
62 |         except Exception as e:
63 |             if e.__class__.__name__ != "TimeoutError":
64 |                 print("Exception:", e)
65 |             return eps
66 | 
67 |         average_cut_weight += cut_weight / n_tests / max_weight
68 | 
69 |     return max(average_cut_weight, eps)
70 | 
71 | utility.budget = config["utility_budget"]
72 | fake_self_str = read_file_as_str(f"tasks/maxcut/utility.py")
73 | utility.str = fake_self_str
74 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/maxcut/seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | def algorithm(adjacency_matrix):
 2 |     n_nodes = len(adjacency_matrix)
 3 |     partition = [-1] * n_nodes
 4 |     unpartitioned_nodes = set(range(n_nodes))
 5 | 
 6 |     while len(unpartitioned_nodes) > 0:
 7 |         max_cut_weight = -1
 8 |         max_cut_node = None
 9 |         max_cut_partition = None
10 | 
11 |         for node in unpartitioned_nodes:
12 |             for partition_id in [0, 1]:
13 |                 cut_weight = 0
14 |                 for neighbor, weight in enumerate(adjacency_matrix[node]):
15 |                     if partition[neighbor] == 1 - partition_id:
16 |                         cut_weight += weight
17 | 
18 |                 if cut_weight > max_cut_weight:
19 |                     max_cut_weight = cut_weight
20 |                     max_cut_node = node
21 |                     max_cut_partition = partition_id
22 | 
23 |         partition[max_cut_node] = max_cut_partition
24 |         unpartitioned_nodes.remove(max_cut_node)
25 | 
26 |     return partition


--------------------------------------------------------------------------------
/tasks/maxcut/utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | 
 4 | def utility(algorithm_str: str):
 5 |     """
 6 |     Implements the Max-Cut utility function. Returns the average cut weight.
 7 |     If the algorithm requires more than 100 milliseconds to run per test, it is a failure.
 8 |     """
 9 | 
10 |     n_tests = 3
11 |     average_cut_weight = 0
12 | 
13 |     try:
14 |         exec(algorithm_str, globals())
15 |     except:
16 |         return 0
17 | 
18 |     for test_idx in range(n_tests):
19 |         n_nodes = 300
20 |         p_edge = 0.4
21 |         max_weight = 10
22 | 
23 |         # Generate random adjacency matrix
24 |         adjacency_matrix = np.zeros((n_nodes, n_nodes))
25 |         for i in range(n_nodes):
26 |             for j in range(i+1, n_nodes):
27 |                 if random.random() < p_edge:
28 |                     weight = random.randint(1, max_weight)
29 |                     adjacency_matrix[i, j] = weight
30 |                     adjacency_matrix[j, i] = weight
31 | 
32 |         # Run the algorithm to find the partition
33 |         try:
34 |             partition = algorithm(adjacency_matrix)
35 |             # Make sure there are exactly two partitions
36 |             if len(set(partition)) != 2:
37 |                 return 0
38 |             if len(partition) != n_nodes:
39 |                 return 0
40 |             cut_weight = 0
41 |             for i in range(n_nodes):
42 |                 for j in range(i+1, n_nodes):
43 |                     if partition[i] != partition[j]:
44 |                         cut_weight += adjacency_matrix[i, j]
45 |         except Exception as e:
46 |             print("Exception:", e)
47 |             cut_weight = 0
48 | 
49 |         average_cut_weight += cut_weight / n_tests / max_weight
50 | 
51 |     return average_cut_weight
52 | 


--------------------------------------------------------------------------------
/tasks/meta_optimization/secret_seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | from helpers import extract_code
 2 | 
 3 | def improve_algorithm(initial_solution, utility, language_model):
 4 |     """Improves a solution according to a utility function."""
 5 |     expertise = "You are an expert computer science researcher and programmer, especially skilled at optimizing algorithms."
 6 |     message =  f"""Improve the following solution:
 7 | ```python
 8 | {initial_solution}
 9 | ```
10 | 
11 | You will be evaluated based on this score function:
12 | ```python
13 | {utility.str}
14 | ```
15 | 
16 | You must return an improved solution. Be as creative as you can under the constraints.
17 | Your primary improvement must be novel and non-trivial. First, propose an idea, then implement it."""
18 |     n_messages = min(language_model.max_responses_per_call, utility.budget)
19 |     new_solutions = language_model.batch_prompt(expertise, [message] * n_messages, temperature=0.7)
20 |     new_solutions = extract_code(new_solutions)
21 |     print("new_solutions:", new_solutions)
22 |     best_solution = max(new_solutions, key=utility)
23 |     return best_solution


--------------------------------------------------------------------------------
/tasks/meta_optimization/secret_seed_algorithm_improved.py:
--------------------------------------------------------------------------------
 1 | from helpers import extract_code
 2 | 
 3 | def improve_algorithm(initial_solution, utility, language_model):
 4 |     """Improves a solution according to a utility function."""
 5 |     expertise = "You are an expert computer science researcher and programmer, especially skilled at optimizing algorithms."
 6 |     
 7 |     n_messages = min(language_model.max_responses_per_call, utility.budget)
 8 |     temperature_values = [0.4, 0.7, 1.0]
 9 |     solutions_cache = set()
10 |     new_solutions = []
11 |     utility_cache = {}
12 | 
13 |     def evaluate_solution(solution):
14 |         if solution not in utility_cache:
15 |             utility_cache[solution] = utility(solution)
16 |         return utility_cache[solution]
17 | 
18 |     for temp in temperature_values:
19 |         base_message =  f"""Improve the following solution:
20 | ```python
21 | {initial_solution}
22 | ```
23 | 
24 | You will be evaluated based on this score function:
25 | ```python
26 | {utility.str}
27 | ```
28 | 
29 | You must return an improved solution. Be as creative as you can under the constraints.
30 | Your primary improvement must be novel and non-trivial. Generate a solution with temperature={temp} that focuses on different aspects of optimization."""
31 |         
32 |         generated_solutions = language_model.batch_prompt(expertise, [base_message] * n_messages, temperature=temp)
33 |         generated_solutions = extract_code(generated_solutions)
34 |         
35 |         # Evaluate and sort the generated solutions by their utility score
36 |         scored_solutions = [(sol, evaluate_solution(sol)) for sol in generated_solutions if sol not in solutions_cache]
37 |         scored_solutions.sort(key=lambda x: x[1], reverse=True)
38 |         
39 |         # Keep only the top n_messages solutions
40 |         top_solutions = scored_solutions[:n_messages]
41 |         
42 |         for sol, _ in top_solutions:
43 |             new_solutions.append(sol)
44 |             solutions_cache.add(sol)
45 | 
46 |     # Dynamically adjust temperature values based on the utility scores
47 |     temperature_values = [temp * (1 + evaluate_solution(sol) / evaluate_solution(initial_solution)) for temp, sol in zip(temperature_values, new_solutions)]
48 | 
49 |     best_solution = max(new_solutions, key=evaluate_solution)
50 |     return best_solution


--------------------------------------------------------------------------------
/tasks/meta_optimization/secret_utility.py:
--------------------------------------------------------------------------------
  1 | from pebble import ThreadPool
  2 | import numpy as np
  3 | import os
  4 | import time
  5 | import traceback
  6 | from language_model import LanguageModel
  7 | from tqdm import tqdm
  8 | 
  9 | from config import config
 10 | from helpers import (
 11 |     read_file_as_str, generate_seed_algorithm, write_str_to_file,
 12 |     temp_override, end_pool_if_used, write_log
 13 | )
 14 | 
 15 | # Suppress warnings
 16 | import warnings
 17 | warnings.filterwarnings("ignore")
 18 | 
 19 | # print the current directory
 20 | base_secret_utility_str = read_file_as_str(f"tasks/{config['subtask']}/secret_utility.py")
 21 | base_utility_str = read_file_as_str(f"tasks/{config['subtask']}/utility.py")
 22 | try:
 23 |     base_algorithm_str = read_file_as_str(f"tasks/{config['subtask']}/seed_algorithm.py")
 24 | except:
 25 |     base_algorithm_str = generate_seed_algorithm(base_utility_str)
 26 |     write_str_to_file(base_algorithm_str, f"tasks/{config['subtask']}/seed_algorithm.py")
 27 | 
 28 | def get_improver(improve_str: str, utility, mode: str = "val", language_model=None):
 29 |     """
 30 |     Uses the improvement algorithm in improve_str to improve the algorithm in algorithm_str, according to the utility function.
 31 |     """
 32 |     try:
 33 |         new_improve_algorithm = temp_override(improve_str, "improve_algorithm")
 34 |         improved_algorithm_str = new_improve_algorithm(base_algorithm_str, utility, language_model)
 35 |         return improved_algorithm_str
 36 |     except Exception as e:
 37 |         print("Definition failed with exception:", e)
 38 |         print(traceback.format_exc())
 39 |         raise e
 40 | 
 41 | def pre_utility_hook(cur_utility_fn):
 42 |     cur_utility_fn.uses = 0
 43 |     language_model = LanguageModel(budget=config['language_model_call_budget'])
 44 |     return language_model
 45 | 
 46 | def create_handled_fn(fn, handle_exceptions, log_usage, pool, timeout=None, fail_value=""):
 47 |     def handled_fn(*args, **kwargs):
 48 |         try:
 49 |             if timeout is not None:
 50 |                 fn_future = pool.schedule(fn, args=args, kwargs=kwargs)
 51 |                 return fn_future.result(timeout=timeout)
 52 |             return fn(*args, **kwargs)
 53 |         except Exception as e:
 54 |             print("Exception in eval:", e)
 55 |             print(traceback.format_exc())
 56 |             if not handle_exceptions:
 57 |                 end_pool_if_used(pool, join_pools=config["join_pools"])
 58 |                 raise e
 59 |             if not log_usage:
 60 |                 end_pool_if_used(pool, join_pools=config["join_pools"])
 61 |                 return e
 62 |             return fail_value
 63 |     return handled_fn
 64 | 
 65 | def meta_utility(improve_str: str, mode: str = "val", log_usage: bool = False, handle_exceptions: bool = True):
 66 |     """
 67 |     Uses the improvement algorithm in improve_str to improve the algorithm in algorithm_str, according to the utility function.
 68 |     """
 69 |     meta_utility.uses = getattr(meta_utility, "uses", 0) + 1
 70 |     if meta_utility.uses > meta_utility.budget:
 71 |         print("Ran out of uses for meta-utility.")
 72 |         return 0
 73 |     if not improve_str:
 74 |         print(f"improve_str is {repr(improve_str)}, returning 0")
 75 |         return 0
 76 |     n_tests = config['meta_utility_tests']
 77 |     use_timeout = False
 78 |     use_parallel = False
 79 |     # We can't use parallelism if we're not using timeout
 80 |     assert not (use_parallel and not use_timeout)
 81 |     expected_utility_val = 0
 82 |     expected_utility_test = 0
 83 |     eval_idx = str(int(time.time()))
 84 |     run_id = max([results_folder for results_folder in os.listdir("results")], key=lambda x: int(x.split("_")[0]))
 85 |     utility = temp_override(base_secret_utility_str, "utility")
 86 | 
 87 |     pool = ThreadPool() if use_timeout else None
 88 |     improved_algorithm_futures = []
 89 |     improved_algorithm_strs = []
 90 |     if use_parallel:
 91 |         for test_idx in tqdm(range(n_tests)):
 92 |             language_model = pre_utility_hook(utility)
 93 |             improved_algorithm_future = pool.schedule(get_improver, (improve_str, utility, mode, language_model))
 94 |             improved_algorithm_futures.append(improved_algorithm_future)
 95 |         for improved_algorithm_future in improved_algorithm_futures:
 96 |             utility.uses = 0  # In case utility points to the same object each time
 97 |             get_improver_wrapped = create_handled_fn(improved_algorithm_future.result, handle_exceptions, log_usage, pool)()
 98 |             improved_algorithm_str = get_improver_wrapped(timeout=60 * 60)
 99 |             if isinstance(improved_algorithm_str, Exception):
100 |                 return 0
101 |             improved_algorithm_strs.append(improved_algorithm_str)
102 |     else:
103 |         for test_idx in tqdm(range(n_tests)):
104 |             language_model = pre_utility_hook(utility)
105 |             timeout = 60 * 60 if use_timeout else None
106 |             get_improver_wrapped = create_handled_fn(get_improver, handle_exceptions, log_usage, pool, timeout=timeout)
107 |             improved_algorithm_str = get_improver_wrapped(improve_str, utility, mode, language_model)
108 |             if isinstance(improved_algorithm_str, Exception):
109 |                 return 0
110 |             improved_algorithm_strs.append(improved_algorithm_str)
111 |     end_pool_if_used(pool, join_pools=config["join_pools"])
112 |     for test_idx, improved_algorithm_str in enumerate(improved_algorithm_strs):
113 |         if not improved_algorithm_str:
114 |             continue
115 |         # Save the improved algorithm to a file
116 |         # First, find the most recent folder in results
117 |         # Then, save the algorithm to that folder
118 |         time_elapsed = int(eval_idx) - int(run_id.split("_")[0])
119 |         write_str_to_file(base_algorithm_str, f"results/{run_id}/base_algorithm_{time_elapsed}_{test_idx}.py")
120 |         write_str_to_file(improved_algorithm_str, f"results/{run_id}/improved_algorithm_{time_elapsed}_{test_idx}.py")
121 |         utility.uses = 0
122 |         utility_wrapped = create_handled_fn(utility, handle_exceptions, log_usage, None, fail_value=0)
123 |         new_utility_val = utility_wrapped(improved_algorithm_str, mode="val")
124 |         if isinstance(new_utility_val, Exception):
125 |             return 0
126 |         # Also log test
127 |         if log_usage:
128 |             print("Evaluating improved algorithm on test")
129 |             utility.uses = 0
130 |             utility_wrapped = create_handled_fn(utility, handle_exceptions, log_usage, None, fail_value=0)
131 |             new_utility_test = utility_wrapped(improved_algorithm_str, mode="test")
132 |             if isinstance(new_utility_test, Exception):
133 |                 return 0
134 |             expected_utility_test += new_utility_test / n_tests
135 |         expected_utility_val += new_utility_val / n_tests
136 |     if log_usage:
137 |         write_log(expected_utility_val, expected_utility_test, run_id)
138 |     return expected_utility_val
139 | 
140 | # We're in secret_utility.py - we want the string of utility.py
141 | fake_self_str = read_file_as_str(f"tasks/{config['task']}/utility.py")
142 | meta_utility.budget = config['meta_utility_budget']
143 | meta_utility.str = fake_self_str


--------------------------------------------------------------------------------
/tasks/meta_optimization/seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | from helpers import extract_code
 2 | 
 3 | def improve_algorithm(initial_solution, utility, language_model):
 4 |     """Improves a solution according to a utility function."""
 5 |     expertise = "You are an expert computer science researcher and programmer, especially skilled at optimizing algorithms."
 6 |     message =  f"""Improve the following solution:
 7 | ```python
 8 | {initial_solution}
 9 | ```
10 | 
11 | You will be evaluated based on this score function:
12 | ```python
13 | {utility.str}
14 | ```
15 | 
16 | You must return an improved solution. Be as creative as you can under the constraints.
17 | Your primary improvement must be novel and non-trivial. First, propose an idea, then implement it."""
18 |     n_messages = min(language_model.max_responses_per_call, utility.budget)
19 |     new_solutions = language_model.batch_prompt(expertise, [message] * n_messages, temperature=0.7)
20 |     new_solutions = extract_code(new_solutions)
21 |     best_solution = max(new_solutions, key=utility)
22 |     return best_solution


--------------------------------------------------------------------------------
/tasks/meta_optimization/transfer_eval.py:
--------------------------------------------------------------------------------
  1 | from pebble import ThreadPool
  2 | import numpy as np
  3 | import os
  4 | import time
  5 | import jsonlines
  6 | import traceback
  7 | from language_model import LanguageModel
  8 | from tqdm import tqdm
  9 | 
 10 | from config import config
 11 | from helpers import (
 12 |     read_file_as_str, generate_seed_algorithm, write_str_to_file,
 13 |     temp_override
 14 | )
 15 | 
 16 | # Suppress warnings
 17 | import warnings
 18 | warnings.filterwarnings("ignore")
 19 | 
 20 | start_time = time.time()
 21 | subtasks = ['three_sat']
 22 | print(subtasks)
 23 | for subtask in subtasks:
 24 |     print("=====================================")
 25 |     print("subtask:", subtask)
 26 |     if subtask == "meta_optimization":
 27 |         continue
 28 |     # print the current directory
 29 |     base_secret_utility_str = read_file_as_str(f"tasks/{subtask}/secret_utility.py")
 30 |     base_utility_str = read_file_as_str(f"tasks/{subtask}/utility.py")
 31 |     try:
 32 |         base_algorithm_str = read_file_as_str(f"tasks/{subtask}/seed_algorithm.py")
 33 |     except:
 34 |         base_algorithm_str = generate_seed_algorithm(base_utility_str)
 35 |         write_str_to_file(base_algorithm_str, f"tasks/{subtask}/seed_algorithm.py")
 36 | 
 37 |     def get_improver(improve_str: str, utility, mode: str = "val", language_model=None):
 38 |         """
 39 |         Uses the improvement algorithm in improve_str to improve the algorithm in algorithm_str, according to the utility function.
 40 |         """
 41 |         try:
 42 |             print("DEFINING ALGORITHM META")
 43 |             new_improve_algorithm = temp_override(improve_str, "improve_algorithm")
 44 |             print("DEFINED ALGORITHM META")
 45 |         except Exception as e:
 46 |             print("Definition failed with exception:", e)
 47 |             print(traceback.format_exc())
 48 |             raise e
 49 |         try:
 50 |             print("Improver:")
 51 |             print(improve_str)
 52 |             print("Base algorithm:")
 53 |             print(base_algorithm_str)
 54 |             improved_algorithm_str = new_improve_algorithm(base_algorithm_str, utility, language_model)
 55 |             print("Improved algorithm:")
 56 |             print(improved_algorithm_str)
 57 |             return improved_algorithm_str
 58 |         except Exception as e:
 59 |             print("=====================================")
 60 |             print("Improver failed with exception:", e)
 61 |             print(traceback.format_exc())
 62 |             print("Improver:")
 63 |             print(improve_str)
 64 |             print("=====================================")
 65 |             raise e
 66 | 
 67 |     def meta_utility(improve_str: str, mode: str = "val", log_usage: bool = False, handle_exceptions: bool = True):
 68 |         """
 69 |         Uses the improvement algorithm in improve_str to improve the algorithm in algorithm_str, according to the utility function.
 70 |         """
 71 |         meta_utility.uses = getattr(meta_utility, "uses", 0) + 1
 72 |         if meta_utility.uses > meta_utility.budget:
 73 |             print("Ran out of uses for meta-utility.")
 74 |             return 0
 75 |         if not improve_str:
 76 |             print(f"improve_str is {repr(improve_str)}, returning 0")
 77 |             return 0
 78 |         n_tests = config['meta_utility_tests']
 79 |         use_timeout = False
 80 |         use_parallel = False
 81 |         # We can't use parallelism if we're not using timeout
 82 |         assert not (use_parallel and not use_timeout)
 83 |         expected_utility_val = 0
 84 |         expected_utility_test = 0
 85 |         eval_idx = str(int(time.time()))
 86 |         run_id = max([results_folder for results_folder in os.listdir("results")], key=lambda x: int(x.split("_")[0]))
 87 |         utility = temp_override(base_secret_utility_str, "utility")
 88 | 
 89 |         if use_timeout:
 90 |             pool = ThreadPool()
 91 |         improved_algorithm_futures = []
 92 |         improved_algorithm_strs = []
 93 |         if use_timeout:
 94 |             if use_parallel:
 95 |                 for test_idx in range(n_tests):
 96 |                     language_model = LanguageModel(budget=config['language_model_call_budget'])
 97 |                     utility.uses = 0
 98 |                     improved_algorithm_future = pool.schedule(get_improver, (improve_str, utility, mode, language_model))
 99 |                     improved_algorithm_futures.append(improved_algorithm_future)
100 |                 for improved_algorithm_future in improved_algorithm_futures:
101 |                     try:
102 |                         timeout = 60 * 60
103 |                         improved_algorithm_str = improved_algorithm_future.result(timeout=timeout)
104 |                     except Exception as e:
105 |                         print("Exception in improving algorithm in utility:", e)
106 |                         print(traceback.format_exc())
107 |                         improved_algorithm_str = ""
108 |                         if not handle_exceptions:
109 |                             raise e
110 |                         if not log_usage:
111 |                             return 0
112 |                     improved_algorithm_strs.append(improved_algorithm_str)
113 |             else:
114 |                 for test_idx in tqdm(range(n_tests)):
115 |                     language_model = LanguageModel(budget=config['language_model_call_budget'])
116 |                     utility.uses = 0
117 |                     improved_algorithm_future = pool.schedule(get_improver, (improve_str, utility, mode, language_model))
118 |                     try:
119 |                         timeout = 60 * 60
120 |                         improved_algorithm_str = improved_algorithm_future.result(timeout=timeout)
121 |                     except Exception as e:
122 |                         print("Exception in improving algorithm in utility:", e)
123 |                         print(traceback.format_exc())
124 |                         improved_algorithm_str = ""
125 |                         if not handle_exceptions:
126 |                             raise e
127 |                         if not log_usage:
128 |                             return 0
129 |                     improved_algorithm_strs.append(improved_algorithm_str)
130 |         else:
131 |             for test_idx in tqdm(range(n_tests)):
132 |                 language_model = LanguageModel(budget=config['language_model_call_budget'])
133 |                 utility.uses = 0
134 |                 try:
135 |                     improved_algorithm_str = get_improver(improve_str, utility, mode, language_model)
136 |                 except Exception as e:
137 |                     print("Exception in improving algorithm in utility:", e)
138 |                     print(traceback.format_exc())
139 |                     improved_algorithm_str = ""
140 |                     if not handle_exceptions:
141 |                         raise e
142 |                     if not log_usage:
143 |                         return 0
144 |                 improved_algorithm_strs.append(improved_algorithm_str)
145 |         if use_timeout:
146 |             pool.stop()
147 |         # pool.join()
148 |         for test_idx, improved_algorithm_str in enumerate(improved_algorithm_strs):
149 |             if not improved_algorithm_str:
150 |                 continue
151 |             # Save the improved algorithm to a file
152 |             # First, find the most recent folder in results
153 |             # Then, save the algorithm to that folder
154 |             time_elapsed = int(eval_idx) - int(run_id.split("_")[0])
155 |             write_str_to_file(base_algorithm_str, f"results/{run_id}/base_algorithm_{time_elapsed}_{test_idx}.py")
156 |             write_str_to_file(improved_algorithm_str, f"results/{run_id}/improved_algorithm_{time_elapsed}_{test_idx}.py")
157 |             print("Evaluating improved algorithm on val")
158 |             try:
159 |                 utility.uses = 0
160 |                 new_utility_val = utility(improved_algorithm_str, mode=mode)
161 |             except Exception as e:
162 |                 print("Exception in evaluating improved algorithm val in metautil:", e)
163 |                 print(traceback.format_exc())
164 |                 new_utility_val = 0
165 |                 if not handle_exceptions:
166 |                     raise e
167 |                 if not log_usage:
168 |                     return 0
169 |             expected_utility_val += new_utility_val / n_tests
170 |             # Also log test
171 |             if log_usage:
172 |                 print("Evaluating improved algorithm on test")
173 |                 try:
174 |                     utility.uses = 0
175 |                     new_utility_test = utility(improved_algorithm_str, mode="test")
176 |                 except Exception as e:
177 |                     print("Exception in evaluating improved algorithm test in metautil:", e)
178 |                     print(traceback.format_exc())
179 |                     new_utility_test = 0
180 |                     if not handle_exceptions:
181 |                         raise e
182 |                     if not log_usage:  # Just in case we end up getting rid of that if above...
183 |                         return 0
184 |                 expected_utility_test += new_utility_test / n_tests
185 |             # Write the utility value to a file
186 |             cur_time = time.time()
187 |             base_save_filename = f"meta_utility_{start_time}_{cur_time}.jsonl"
188 |             if config['transfer_eval_type'] == 'base':
189 |                 save_filename = 'base' + base_save_filename
190 |             else:
191 |                 save_filename = 'improved' + base_save_filename
192 |                 
193 |             with jsonlines.open(save_filename, mode="a") as writer:
194 |                 writer.write({"task": subtask, "utility": new_utility_val, "utility_test": new_utility_test})
195 |         print("Expected utility val:", expected_utility_val)
196 |         if log_usage:
197 |             print("Expected utility test:", expected_utility_test)
198 | 
199 |         return expected_utility_val, expected_utility_test
200 | 
201 | 
202 |     # We're in secret_utility.py - we want the string of utility.py
203 |     fake_self_str = read_file_as_str(f"tasks/{config['task']}/utility.py")
204 |     meta_utility.budget = config['meta_utility_budget']
205 |     meta_utility.str = fake_self_str
206 | 
207 |     if config['transfer_eval_type'] == 'base':
208 |         improve_algorithm_path = "tasks/meta_optimization/secret_seed_algorithm.py"
209 |     else:
210 |         improve_algorithm_base = "tasks/meta_optimization/secret_seed_algorithm_improved.py"
211 |     with open(improve_algorithm_path, 'r') as file:
212 |         improve_algorithm_str = file.read()
213 |     
214 |     util_val, util_test = meta_utility(improve_algorithm_str, log_usage=True, handle_exceptions=True)


--------------------------------------------------------------------------------
/tasks/meta_optimization/utility.py:
--------------------------------------------------------------------------------
 1 | from algorithm import algorithm_str
 2 | from task_utility import utility
 3 | from language_model import LanguageModel
 4 | 
 5 | def meta_utility(improve_str: str):
 6 |     """
 7 |     Evaluates the algorithm in improve_str to improve the algorithm in algorithm_str, according to
 8 |     some downstream utility function. This meta-utility function can only be called 25 times.
 9 |     """
10 |     if meta_utility.uses > meta_utility.budget:
11 |         return 0
12 |     meta_utility.increment_uses()
13 |     n_tests = 5
14 |     expected_utility = 0
15 |     for _ in range(n_tests):
16 |         if utility.uses >= utility.budget:
17 |             break
18 |         try:
19 |             exec(improve_str, globals())  # Define improve_algorithm function
20 |         except:
21 |             continue
22 |         # At most 4 calls to language model, and at most 6 samples each time
23 |         language_model = LanguageModel(budget=4, max_responses_per_call=6)
24 |         improved_algorithm_str = improve_algorithm(algorithm_str, utility, language_model)
25 |         expected_utility += utility(improved_algorithm_str) / n_tests
26 | 
27 |     return expected_utility


--------------------------------------------------------------------------------
/tasks/modified_quadratic_assignment/secret_utility.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pebble import ThreadPool
 3 | from helpers import temp_override
 4 | import time
 5 | 
 6 | def utility(algorithm_str: str, mode: str = "val"):
 7 |     """
 8 |     Implements the Modified Quadratic Assignment Problem (MQAP) with n facilities/locations.
 9 |     Returns the objective value, where higher is better.
10 |     The algorithm must be extremely fast. If it takes more than 500 milliseconds to run, it is a failure.
11 |     Your algorithm function must be named 'algorithm' and take three arguments: F, D, and P, 
12 |     which are numpy arrays of shape (n, n) containing the flow, distance, and preference matrices.
13 |     """
14 |     # Uncomment to limit the number of times the algorithm can be used
15 |     # uses = getattr(utility, "uses", 0)
16 |     # if uses >= utility.budget:
17 |     #     return 0
18 |     # if not algorithm_str:
19 |     #     print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
20 |     #     return 0
21 |     # utility.uses = uses + 1
22 | 
23 |     n_tests = 10
24 |     n = 15  # Number of facilities and locations
25 |     lambda_value = 0.5  # Preference weight
26 |     average_objective = 0
27 |     base_seed = 4321 if mode == "val" else 5678
28 |     pool = ThreadPool()
29 |     eps = 1e-2
30 |     scale = n * n
31 | 
32 |     try:
33 |         algorithm = temp_override(algorithm_str, "algorithm")
34 |     except:
35 |         return eps
36 | 
37 |     for test_idx in range(n_tests):
38 |         np.random.seed(base_seed + test_idx)  # Consistent seeding for evaluation
39 |         F = np.random.rand(n, n)
40 |         D = np.random.rand(n, n)
41 |         P = np.random.rand(n, n)
42 |         
43 |         try:
44 |             start_time = time.time()
45 |             assignment_future = pool.schedule(algorithm, (F, D, P))
46 |             assignment = assignment_future.result(timeout=0.5)
47 |             total_time = time.time() - start_time
48 | 
49 |             if set(assignment) == set(range(n)):
50 |                 objective = sum(F[i, j] * D[assignment[i], assignment[j]] for i in range(n) for j in range(n))
51 |                 objective -= lambda_value * sum(P[i, assignment[i]] for i in range(n))
52 |                 objective += total_time
53 |             else:
54 |                 objective = 0.0
55 | 
56 |             average_objective += objective / n_tests
57 |         except Exception as e:
58 |             average_objective += 0.0
59 | 
60 |     return max(average_objective / scale, eps)
61 | 
62 | from config import config
63 | from helpers import read_file_as_str
64 | import os
65 | utility.budget = config["utility_budget"]
66 | fake_self_str = read_file_as_str(f"tasks/modified_quadratic_assignment/utility.py")
67 | utility.str = fake_self_str
68 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/modified_quadratic_assignment/seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from random import randint, random
 3 | from copy import deepcopy
 4 | 
 5 | def algorithm(F, D, P):
 6 |     def mqap_objective(assignment):
 7 |         objective = sum(F[i, j] * D[assignment[i], assignment[j]] for i in range(n) for j in range(n))
 8 |         objective -= lambda_value * sum(P[i, assignment[i]] for i in range(n))
 9 |         return objective
10 | 
11 |     def swap_random(assignment):
12 |         i, j = randint(0, n - 1), randint(0, n - 1)
13 |         while i == j:
14 |             j = randint(0, n - 1)
15 |         assignment[i], assignment[j] = assignment[j], assignment[i]
16 | 
17 |     n = len(F)
18 |     lambda_value = 0.5
19 |     max_iterations = 1000
20 |     temperature = 1.0
21 |     cooling_rate = 0.99
22 | 
23 |     assignment = list(range(n))
24 |     best_assignment = deepcopy(assignment)
25 |     best_objective = mqap_objective(assignment)
26 | 
27 |     for _ in range(max_iterations):
28 |         temperature *= cooling_rate
29 |         if temperature == 0:
30 |             break
31 | 
32 |         new_assignment = deepcopy(assignment)
33 |         swap_random(new_assignment)
34 |         new_objective = mqap_objective(new_assignment)
35 |         delta_objective = new_objective - mqap_objective(assignment)
36 | 
37 |         if delta_objective < 0 or random() < np.exp(-delta_objective / temperature):
38 |             assignment = new_assignment
39 | 
40 |             if new_objective < best_objective:
41 |                 best_assignment = deepcopy(assignment)
42 |                 best_objective = new_objective
43 | 
44 |     return best_assignment


--------------------------------------------------------------------------------
/tasks/modified_quadratic_assignment/utility.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from pebble import ThreadPool
 3 | from helpers import temp_override
 4 | import time
 5 | 
 6 | def utility(algorithm_str: str):
 7 |     """
 8 |     Implements the Modified Quadratic Assignment Problem (MQAP) with n facilities/locations.
 9 |     Returns the objective value, where higher is better.
10 |     The algorithm must be extremely fast. If it takes more than 500 milliseconds to run, it is a failure.
11 |     Your algorithm function must be named 'algorithm' and take three arguments: F, D, and P, 
12 |     which are numpy arrays of shape (n, n) containing the flow, distance, and preference matrices.
13 |     """
14 |     n_tests = 20
15 |     n = 15  # Number of facilities and locations
16 |     lambda_value = 0.5  # Preference weight
17 |     average_objective = 0
18 |     pool = ThreadPool()
19 | 
20 |     try:
21 |         exec(algorithm_str, globals())
22 |     except:
23 |         return 0
24 | 
25 |     for test_idx in range(n_tests):
26 |         F = np.random.rand(n, n)
27 |         D = np.random.rand(n, n)
28 |         P = np.random.rand(n, n)
29 |         
30 |         try:
31 |             start_time = time.time()
32 |             assignment_future = pool.schedule(algorithm, (F, D, P))
33 |             assignment = assignment_future.result(timeout=0.01)
34 |             total_time = time.time() - start_time
35 | 
36 |             if set(assignment) == set(range(n)):
37 |                 objective = sum(F[i, j] * D[assignment[i], assignment[j]] for i in range(n) for j in range(n))
38 |                 objective -= lambda_value * sum(P[i, assignment[i]] for i in range(n))
39 |                 objective += total_time
40 |             else:
41 |                 objective = 0
42 | 
43 |             average_objective += objective / n_tests
44 |         except Exception as e:
45 |             average_objective += 0
46 | 
47 |     return average_objective
48 | 


--------------------------------------------------------------------------------
/tasks/parity_noise/secret_utility.py:
--------------------------------------------------------------------------------
 1 | from pebble import ThreadPool
 2 | from helpers import temp_override, read_file_as_str
 3 | import multiprocess
 4 | import random
 5 | import numpy as np
 6 | import time
 7 | from config import config
 8 | 
 9 | def utility(algorithm_str: str, mode: str = "val"):
10 |     """
11 |     Implements the parity learning task. Returns the number of correct predictions.
12 |     """
13 |     uses = getattr(utility, "uses", 0)
14 |     if uses >= utility.budget:
15 |         return 0
16 |     if not algorithm_str:
17 |         print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
18 |         return 0
19 |     utility.uses = uses + 1
20 | 
21 |     if mode == "test":
22 |         n_tests = 50
23 |     else:
24 |         n_tests = 20
25 |     average_correct = 0
26 |     eps = 1e-6
27 |     base_seed = 4321 if mode == "val" else 5678
28 |     pool = ThreadPool()
29 | 
30 |     try:
31 |         algorithm = temp_override(algorithm_str, "algorithm")
32 |     except Exception as e:
33 |         print(e.__class__.__name__, "Exception in utility:", e)
34 |         print("algorithm_str:", algorithm_str)
35 |         pool.stop()
36 |         if config['join_pools']:
37 |             pool.join()
38 |         return 0
39 | 
40 |     for test_idx in range(n_tests):
41 |         np.random.seed(base_seed + test_idx)
42 |         random.seed(base_seed + test_idx)
43 | 
44 |         n_bits = 10
45 |         p_true = 0.3
46 |         n_train_samples = 100
47 |         n_test_samples = 20
48 |         noise_level = 0.05
49 |         true_bits = np.random.binomial(1, p_true, n_bits)
50 |         
51 |         samples = np.random.binomial(1, 0.5, (n_train_samples + n_test_samples, n_bits))
52 |         masked_samples = samples * true_bits
53 |         parity = np.sum(masked_samples, axis=1) % 2
54 |         train_samples = samples[:n_train_samples]
55 |         train_parity = parity[:n_train_samples]
56 |         parity_noise = np.random.binomial(1, noise_level, n_train_samples)
57 |         train_parity = (train_parity + parity_noise) % 2
58 | 
59 |         test_samples = samples[n_train_samples:]
60 |         test_parity = parity[n_train_samples:]
61 | 
62 |         # Because algorithm is a string, we can't call it directly. Instead, we can use eval to evaluate it as a Python expression.
63 |         try:
64 |             timeout = 2
65 |             start_time = time.time()
66 |             predictions_future = pool.schedule(algorithm, (train_samples, train_parity, test_samples))
67 |             predictions = predictions_future.result(timeout=timeout)
68 |             end_time = time.time()
69 |             if end_time - start_time > timeout:
70 |                 pool.stop()
71 |                 if config['join_pools']:
72 |                     pool.join()
73 |                 print("Timeout in utility, returning 0")
74 |                 return eps
75 |             # Make them both row vectors
76 |             predictions = np.array(predictions).reshape(-1)
77 |             test_parity = np.array(test_parity).reshape(-1)
78 |             correct = np.sum(predictions == test_parity) / n_test_samples
79 |         except Exception as e:
80 |             print(e.__class__.__name__, "Exception in utility:", e)
81 |             pool.stop()
82 |             if config['join_pools']:
83 |                 pool.join()
84 |             return eps
85 |         average_correct += correct / n_tests
86 |     print("average_correct:", average_correct)
87 |     pool.stop()
88 |     if config['join_pools']:
89 |         pool.join()
90 |     return average_correct
91 | 
92 | utility.budget = config["utility_budget"]
93 | fake_self_str = read_file_as_str(f"tasks/parity_noise/utility.py")
94 | utility.str = fake_self_str
95 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/parity_noise/seed_algorithm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def algorithm(train_samples, train_parity, test_samples):
4 |     predictions = np.random.binomial(1, 0.5, len(test_samples))
5 |     return predictions


--------------------------------------------------------------------------------
/tasks/parity_noise/utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | import time
 4 | 
 5 | def utility(algorithm_str: str):
 6 |     """
 7 |     Implements the parity learning task. Returns the number of correct predictions.
 8 |     """
 9 | 
10 |     n_tests = 3
11 |     average_correct = 0
12 | 
13 |     try:
14 |         exec(algorithm_str, globals())
15 |     except:
16 |         return 0
17 | 
18 |     for _ in range(n_tests):
19 |         start_time = time.time()
20 |         n_bits = 10
21 |         p_true = 0.3
22 |         n_train_samples = 100
23 |         n_test_samples = 20
24 |         noise_level = 0.05
25 |         true_bits = np.random.binomial(1, p_true, n_bits)
26 |         
27 |         samples = np.random.binomial(1, 0.5, (n_train_samples + n_test_samples, n_bits))
28 |         masked_samples = samples * true_bits
29 |         parity = np.sum(masked_samples, axis=1) % 2
30 |         train_samples = samples[:n_train_samples]
31 |         train_parity = parity[:n_train_samples]
32 |         parity_noise = np.random.binomial(1, noise_level, n_train_samples)
33 |         train_parity = (train_parity + parity_noise) % 2
34 | 
35 |         test_samples = samples[n_train_samples:]
36 |         test_parity = parity[n_train_samples:]
37 | 
38 |         # Because algorithm is a string, we can't call it directly. Instead, we can use eval to evaluate it as a Python expression.
39 |         try:
40 |             predictions = algorithm(train_samples, train_parity, test_samples)
41 |             test_parity = np.array(test_parity).reshape(-1)
42 |             predictions = np.array(predictions).reshape(-1)
43 |             correct = np.sum(predictions == test_parity) / n_test_samples
44 |         except:
45 |             correct = 0
46 |         # Use no more than 100 milliseconds per test
47 |         if time.time() - start_time > 0.1:
48 |             return 0
49 |         average_correct += correct / n_tests
50 | 
51 |     return average_correct


--------------------------------------------------------------------------------
/tasks/parity_noiseless/secret_utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | from helpers import temp_override, read_file_as_str
 4 | from config import config
 5 | 
 6 | def utility(algorithm_str: str, mode: str = "val"):
 7 |     """
 8 |     Implements the parity learning task. Returns the number of correct predictions.
 9 |     """
10 | 
11 |     uses = getattr(utility, "uses", 0)
12 |     if uses >= utility.budget:
13 |         return 0
14 |     if not algorithm_str:
15 |         print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
16 |         return 0
17 |     utility.uses = uses + 1
18 | 
19 |     n_tests = 20
20 |     average_correct = 0
21 |     base_seed = 4321 if mode == "val" else 5678
22 | 
23 |     try:
24 |         algorithm = temp_override(algorithm_str, "algorithm")
25 |     except Exception as e:
26 |         return 0
27 | 
28 |     for test_idx in range(n_tests):
29 |         np.random.seed(base_seed + test_idx)
30 |         random.seed(base_seed + test_idx)
31 | 
32 |         n_bits = 10
33 |         p_true = 0.3
34 |         n_train_samples = 100
35 |         n_test_samples = 20
36 |         true_bits = np.random.binomial(1, p_true, n_bits)
37 |         
38 |         samples = np.random.binomial(1, 0.5, (n_train_samples + n_test_samples, n_bits))
39 |         masked_samples = samples * true_bits
40 |         parity = np.sum(masked_samples, axis=1) % 2
41 |         train_samples = samples[:n_train_samples]
42 |         train_parity = parity[:n_train_samples]
43 | 
44 |         test_samples = samples[n_train_samples:]
45 |         test_parity = parity[n_train_samples:]
46 | 
47 |         # Because algorithm is a string, we can't call it directly. Instead, we can use eval to evaluate it as a Python expression.
48 |         try:
49 |             predictions = algorithm(train_samples, train_parity, test_samples)
50 |             correct = np.sum(predictions == test_parity) / n_test_samples
51 |         except Exception as e:
52 |             print("Exception:", e)
53 |             correct = 0
54 |         average_correct += correct / n_tests
55 |     return average_correct
56 | 
57 | utility.budget = config["utility_budget"]
58 | # get the name of the file's directory
59 | fake_self_str = read_file_as_str(f"tasks/parity_noiseless/utility.py")
60 | utility.str = fake_self_str
61 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/parity_noiseless/seed_algorithm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def algorithm(train_samples, train_parity, test_samples):
4 |     predictions = np.random.binomial(1, 0.5, len(test_samples))
5 |     return predictions


--------------------------------------------------------------------------------
/tasks/parity_noiseless/utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | 
 4 | def utility(algorithm_str: str):
 5 |     """
 6 |     Implements the parity learning task. Returns the number of correct predictions.
 7 |     """
 8 | 
 9 |     n_tests = 3
10 |     average_correct = 0
11 | 
12 |     try:
13 |         exec(algorithm_str, globals())
14 |     except:
15 |         return 0
16 | 
17 |     for _ in range(n_tests):
18 |         n_bits = 10
19 |         p_true = 0.3
20 |         n_train_samples = 80
21 |         n_test_samples = 20
22 |         true_bits = np.random.binomial(1, p_true, n_bits)
23 |         
24 |         samples = np.random.binomial(1, 0.5, (n_train_samples + n_test_samples, n_bits))
25 |         masked_samples = samples * true_bits
26 |         parity = np.sum(masked_samples, axis=1) % 2
27 |         train_samples = samples[:n_train_samples]
28 |         train_parity = parity[:n_train_samples]
29 | 
30 |         test_samples = samples[n_train_samples:]
31 |         test_parity = parity[n_train_samples:]
32 | 
33 |         # Because algorithm is a string, we can't call it directly. Instead, we can use eval to evaluate it as a Python expression.
34 |         try:
35 |             predictions = algorithm(train_samples, train_parity, test_samples)
36 |             correct = np.sum(predictions == test_parity) / n_test_samples
37 |         except:
38 |             correct = 0
39 |         average_correct += correct / n_tests
40 | 
41 |     return average_correct


--------------------------------------------------------------------------------
/tasks/sandbox/seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | from helpers import extract_code
 2 | 
 3 | def improve_algorithm(initial_solution, utility, language_model):
 4 |     """Improves a solution according to a utility function."""
 5 |     expertise = "You are an expert computer science researcher and programmer, especially skilled at optimizing algorithms."
 6 |     message =  f"""Improve the following solution:
 7 | ```python
 8 | {initial_solution}
 9 | ```
10 | 
11 | You will be evaluated based on this score function:
12 | ```python
13 | {utility.str}
14 | ```
15 | 
16 | You must return an improved solution. Be as creative as you can under the constraints.
17 | Your primary improvement must be novel and non-trivial. First, propose an idea, then implement it."""
18 |     n_messages = min(language_model.max_responses_per_call, utility.budget)
19 |     new_solutions = language_model.batch_prompt(expertise, [message] * n_messages, temperature=0.7)
20 |     new_solutions = extract_code(new_solutions)
21 |     best_solution = None
22 |     for solution in new_solutions:
23 |         score = utility(solution, use_sandbox=True)
24 |         if best_solution is None or score > best_score:
25 |             best_score = score
26 |             best_solution = solution
27 |     return best_solution


--------------------------------------------------------------------------------
/tasks/sandbox/utility.py:
--------------------------------------------------------------------------------
 1 | from algorithm import algorithm_str
 2 | from task_utility import utility
 3 | from language_model import LanguageModel
 4 | from run import run
 5 | 
 6 | def meta_utility(improve_str: str, use_sandbox: bool):
 7 |     """
 8 |     Evaluates the algorithm in improve_str to improve the algorithm in algorithm_str, according to
 9 |     some downstream utility function. This meta-utility function can only be called 25 times.
10 |     """
11 |     if meta_utility.uses > meta_utility.budget:
12 |         return 0
13 |     meta_utility.increment_uses()
14 |     n_tests = 5
15 |     expected_utility = 0
16 |     for _ in range(n_tests):
17 |         if utility.uses >= utility.budget:
18 |             break
19 |         try:
20 |             run(improve_str, globals(), use_sandbox=use_sandbox)  # Define improve_algorithm function
21 |         except:
22 |             continue
23 |         # At most 5 calls to language model, and at most 5 samples each time
24 |         language_model = LanguageModel(budget=5, max_responses_per_call=5)
25 |         improved_algorithm_str = improve_algorithm(algorithm_str, utility, language_model)
26 |         expected_utility += utility(improved_algorithm_str, use_sandbox=use_sandbox) / n_tests
27 | 
28 |     return expected_utility


--------------------------------------------------------------------------------
/tasks/str_grid_dist/secret_utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | import numpy as np
 4 | from pebble import ThreadPool
 5 | from helpers import temp_override, read_file_as_str
 6 | from config import config
 7 | 
 8 | def utility(algorithm_str: str, mode: str = "val"):
 9 |     """
10 |     Implements the str_grid_dist task. Returns a value between -1 and 1.
11 |     """
12 |     # Uncomment to limit the number of times the algorithm can be called
13 |     # uses = getattr(utility, "uses", 0)
14 |     # if uses >= utility.budget:
15 |     #     return 0
16 |     # if not algorithm_str:
17 |     #     print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
18 |     #     return 0
19 |     # utility.uses = uses + 1
20 | 
21 |     base_seed = 4321 if mode == "val" else 5678
22 |     pool = ThreadPool()
23 |     try:
24 |         algorithm = temp_override(algorithm_str, "algorithm")
25 |     except:
26 |         pool.stop()
27 |         if config['join_pools']:
28 |             pool.join()
29 |         return 0.0
30 | 
31 |     scores = []    
32 |     for test_idx in range(50):
33 |         np.random.seed(base_seed + test_idx)
34 |         random.seed(base_seed + test_idx)
35 |         length = random.randint(1, 30)
36 |         t = "".join(random.choice("AB") for _ in range(length))
37 |         s = "".join(random.choice("AB") for _ in range(length))
38 |         dist = grid_dist(s, t)
39 | 
40 |         # Because algorithm is a string, we can't call it directly. Instead, we can use eval to evaluate it as a Python expression.
41 |         try:
42 |             timeout = 1
43 |             predictions_future = pool.schedule(score_test, (t, dist, algorithm))
44 |             predictions = predictions_future.result(timeout=timeout)
45 |             scores.append(predictions)
46 |         except Exception as e:
47 |             print("Exception in utility:", e)
48 |             print(e.__class__.__name__)
49 |             scores.append(0.0)
50 |     pool.stop()
51 |     if config['join_pools']:
52 |         pool.join()
53 |     return sum(scores) / len(scores)
54 |         
55 | def grid_dist(s: str, t: str):
56 |     assert isinstance(s, str) and isinstance(t, str) and len(s) == len(t) and set(s + t) <= set("AB")
57 |     ans = sum(a != b for a, b in zip(s, t))
58 |     ans += sum(a != b for a, b in zip(s, s[1:]))
59 |     ans += sum(a != b for a, b in zip(t, t[1:]))
60 |     return ans
61 | 
62 | 
63 | def score_test(t: str, dist: int, find_at_dist: callable, max_time=0.1) -> float:
64 |     start_time = time.time()        
65 |     try:
66 |         s = find_at_dist(t, dist)
67 |         d = grid_dist(s, t)
68 |         if time.time() - start_time > max_time:
69 |             return 0
70 |         if d == dist:
71 |             return 1.0  # perfect!
72 |         else:
73 |             return 0.5 - abs(d - dist)/(6*len(t)) # between 0 and 0.5
74 |     except:
75 |         return 0  # error
76 | 
77 | utility.budget = config["utility_budget"]
78 | # get the name of the file's directory
79 | fake_self_str = read_file_as_str(f"tasks/str_grid_dist/utility.py")
80 | utility.str = fake_self_str
81 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/str_grid_dist/seed_algorithm.py:
--------------------------------------------------------------------------------
1 | def algorithm(t: str, dist: int):
2 |     return t


--------------------------------------------------------------------------------
/tasks/str_grid_dist/utility.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | 
 4 | def utility(algorithm_str: str):
 5 |     """Implements the str_grid_dist task. Returns a value between 0 and 1."""
 6 | 
 7 |     try:
 8 |         exec(algorithm_str, globals())
 9 |     except:
10 |         return 0.0
11 | 
12 |     scores = []    
13 |     for _ in range(10):
14 |         length = random.randint(1, 30)
15 |         t = "".join(random.choice("AB") for _ in range(length))
16 |         s = "".join(random.choice("AB") for _ in range(length))
17 |         dist = grid_dist(s, t)
18 |         scores.append(score_test(t, dist, algorithm))
19 |     return sum(scores) / len(scores)
20 |         
21 | def grid_dist(s: str, t: str):
22 |     assert isinstance(s, str) and isinstance(t, str) and len(s) == len(t) and set(s + t) <= set("AB")
23 |     ans = sum(a != b for a, b in zip(s, t))
24 |     ans += sum(a != b for a, b in zip(s, s[1:]))
25 |     ans += sum(a != b for a, b in zip(t, t[1:]))
26 |     return ans
27 | 
28 | 
29 | def score_test(t: str, dist: int, find_at_dist: callable, max_time=0.1) -> float:
30 |     start_time = time.time()        
31 |     try:
32 |         s = find_at_dist(t, dist)
33 |         d = grid_dist(s, t)
34 |         if time.time() - start_time > max_time:
35 |             return 0.0
36 |         if d == dist:
37 |             return 1.0  # perfect!
38 |         else:
39 |             return 0.5 - abs(d - dist)/(6*len(t)) # between 0 and 0.5
40 |     except:
41 |         return 0.0  # error


--------------------------------------------------------------------------------
/tasks/three_sat/secret_utility.py:
--------------------------------------------------------------------------------
  1 | from pebble import ThreadPool
  2 | from tqdm import tqdm
  3 | from helpers import temp_override
  4 | import numpy as np
  5 | import random
  6 | import time
  7 | from helpers import read_file_as_str
  8 | 
  9 | def generate_3sat_formula(n, m):
 10 |     """Generate a random 3-SAT formula with n variables and m clauses."""
 11 |     formula = []
 12 |     valid_assignment = [False] + [random.random() < 0.5 for _ in range(n)]
 13 |     for _ in range(m ** 2):
 14 |         clause = random.sample(range(1, n + 1), 3)
 15 |         clause = [var if random.random() < 0.5 else -var for var in clause]
 16 |         # Check if the clause is satisfied by the valid assignment
 17 |         if any((valid_assignment[abs(lit)] > 0) == (lit > 0) for lit in clause):
 18 |             formula.append(clause)
 19 |         if len(formula) == m:
 20 |             break
 21 |     random.shuffle(formula)
 22 |     return formula
 23 | 
 24 | def check_3sat_formula(formula, assignment):
 25 |     success = all(any((assignment[abs(lit)] > 0) == (lit > 0) for lit in clause) for clause in formula)
 26 |     return success
 27 | 
 28 | def utility(algorithm_str: str, mode: str = "val"):
 29 |     """
 30 |     Implements the Random 3-SAT problem with n variables and m clauses.
 31 |     Returns the fraction of formulas solved successfully within the time limit.
 32 |     The algorithm must be extremely fast. If it takes more than 10 milliseconds to run, it is a failure.
 33 |     Your algorithm function must be named 'algorithm' and take a single argument, formula,
 34 |     which is a list of m clauses, each containing exactly 3 literals.
 35 |     """
 36 |     # Uncomment to limit the number of times the algorithm can be used
 37 |     # uses = getattr(utility, "uses", 0)
 38 |     # if uses >= utility.budget:
 39 |     #     return 0
 40 |     # if not algorithm_str:
 41 |     #     print(f"algorithm_str is {repr(algorithm_str)}, returning 0")
 42 |     #     return 0
 43 |     # utility.uses = uses + 1
 44 | 
 45 |     n_tests = 30
 46 |     min_n = 5  # Min number of variables
 47 |     max_n = 50  # Max number of variables
 48 |     solved_count = 0
 49 |     base_seed = 4321 if mode == "val" else 5678
 50 |     timeout = 0.1
 51 |     eps = 1e-2
 52 |     join_pool = False
 53 |     
 54 |     pool = ThreadPool()
 55 | 
 56 |     try:
 57 |         algorithm = temp_override(algorithm_str, "algorithm")
 58 |     except:
 59 |         pool.stop()
 60 |         if join_pool:
 61 |             pool.join()
 62 |         return eps
 63 | 
 64 |     for test_idx in tqdm(range(n_tests)):
 65 |         random.seed(base_seed + test_idx)  # Consistent seeding for evaluation
 66 |         n = random.randint(min_n, max_n)
 67 |         m = int(4 * n)  # Number of clauses (change 4 to a different number to adjust difficulty)
 68 |         formula = generate_3sat_formula(n, m)
 69 |         try:
 70 |             formula_copy = formula.copy()
 71 |             time_start = time.time()
 72 |             if isinstance(pool, ThreadPool):
 73 |                 assignment_future = pool.schedule(algorithm, (formula_copy,))
 74 |             else:
 75 |                 assignment_future = pool.schedule(algorithm, (formula_copy,), timeout=timeout)
 76 |             assignment = assignment_future.result(timeout=timeout)
 77 |             time_end = time.time()
 78 |             if time_end - time_start > timeout:
 79 |                 solved_count += eps
 80 |                 continue
 81 |             # Validate the solution
 82 |             if check_3sat_formula(formula, assignment):
 83 |                 solved_count += 1
 84 |             else:
 85 |                 solved_count += eps
 86 |         except Exception as e:
 87 |             if not isinstance(e, TimeoutError):
 88 |                 pool.stop()
 89 |                 return eps
 90 | 
 91 |     pool.stop()
 92 |     if join_pool:
 93 |         pool.join()
 94 |     print(f"average_correct: {solved_count / n_tests}")
 95 |     return max(solved_count / n_tests, eps)
 96 | 
 97 | from config import config
 98 | from helpers import read_file_as_str
 99 | import os
100 | utility.budget = config["utility_budget"]
101 | # get the name of the file's directory
102 | fake_self_str = read_file_as_str(f"tasks/three_sat/utility.py")
103 | utility.str = fake_self_str
104 | utility.uses = 0


--------------------------------------------------------------------------------
/tasks/three_sat/seed_algorithm.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | def random_walk_solver(formula, max_iter, p):
 4 |     n = max(abs(lit) for clause in formula for lit in clause)
 5 |     assignments = [False] * (n + 1)
 6 |     
 7 |     for _ in range(max_iter):
 8 |         unsatisfied_clauses = [clause for clause in formula if not any(assignments[abs(lit)] == (lit > 0) for lit in clause)]
 9 |         
10 |         if not unsatisfied_clauses:
11 |             return assignments
12 |         
13 |         clause_to_flip = random.choice(unsatisfied_clauses)
14 |         if random.random() < p:
15 |             lit_to_flip = random.choice(clause_to_flip)
16 |         else:
17 |             lit_to_flip = min(clause_to_flip, key=lambda lit: sum(assignments[abs(lit)] == (lit > 0) for clause in formula if lit in clause))
18 |         
19 |         assignments[abs(lit_to_flip)] = not assignments[abs(lit_to_flip)]
20 |     
21 |     return None
22 | 
23 | def algorithm(formula):
24 |     return random_walk_solver(formula, max_iter=1000, p=0.4)


--------------------------------------------------------------------------------
/tasks/three_sat/utility.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | from pebble import ThreadPool
 4 | from helpers import temp_override
 5 | import time
 6 | 
 7 | def generate_3sat_formula(n, m):
 8 |     """Generate a random 3-SAT formula with n variables and m clauses."""
 9 |     formula = []
10 |     valid_assignment = [False] + [random.random() < 0.5 for _ in range(n)]
11 |     while len(formula) < m:
12 |         clause = random.sample(range(1, n + 1), 3)
13 |         clause = [var if random.random() < 0.5 else -var for var in clause]
14 |         # Check if the clause is satisfied by the valid assignment
15 |         if any((valid_assignment[abs(lit)] > 0) == (lit > 0) for lit in clause):
16 |             formula.append(clause)
17 |     return formula
18 | 
19 | def check_3sat_formula(formula, assignment):
20 |     return all(any((assignment[abs(lit)] > 0) == (lit > 0) for lit in clause) for clause in formula)
21 | 
22 | def utility(algorithm_str: str):
23 |     """
24 |     Implements the Random 3-SAT problem with n variables and m clauses.
25 |     Returns the fraction of formulas solved successfully within the time limit.
26 |     The algorithm must be extremely fast. If it takes more than 10 milliseconds to run, it is a failure.
27 |     Your algorithm function must be named 'algorithm' and take a single argument, formula
28 |     which is a list of m clauses, each containing exactly 3 literals.
29 |     """
30 |     n_tests = 100
31 |     n = 50  # Number of variables
32 |     m = int(4 * n)  # Number of clauses
33 |     solved_count = 0
34 |     pool = ThreadPool()
35 | 
36 |     try:
37 |         exec(algorithm_str, globals())
38 |     except:
39 |         return 0
40 | 
41 |     for test_idx in range(n_tests):
42 |         formula = generate_3sat_formula(n, m)
43 |         try:
44 |             assignment_future = pool.schedule(algorithm, (formula,))
45 |             assignment = assignment_future.result(timeout=0.01)
46 |             if check_3sat_formula(formula, assignment):
47 |                 solved_count += 1
48 |         except Exception as e:
49 |             return 0
50 | 
51 |     return solved_count / n_tests


--------------------------------------------------------------------------------