├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── alpha_codium ├── __init__.py ├── code_contests │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── prepare_and_clean_dataset.py │ │ ├── provider.py │ │ └── yaml_vs_json.py │ └── eval │ │ ├── __init__.py │ │ ├── code_contests_metric.py │ │ ├── code_test_runners.py │ │ ├── local_exec.py │ │ ├── pass_at_k_evaluator.py │ │ └── tracer.py ├── evaluate_dataset.py ├── gen │ ├── __init__.py │ ├── coding_competitor.py │ ├── dataset_solver.py │ ├── example.log │ ├── generators.py │ ├── stages │ │ ├── indirect │ │ │ ├── run_analyze_and_fix_test_failure.py │ │ │ ├── run_analyze_tests_failure.py │ │ │ ├── run_fix_code_from_tests_failure.py │ │ │ ├── run_fix_self_reflect.py │ │ │ └── run_validate_ai_test.py │ │ ├── run_baseline.py │ │ ├── run_choose_best_solution.py │ │ ├── run_evaluate_all_ai_tests.py │ │ ├── run_evaluate_public_tests.py │ │ ├── run_generate_ai_test.py │ │ ├── run_generate_possible_solutions.py │ │ ├── run_initial_code_generation.py │ │ ├── run_initial_solve.py │ │ ├── run_self_reflect.py │ │ ├── run_tests.py │ │ └── utils.py │ └── utils.py ├── litellm │ └── proxy │ │ └── _types.py ├── llm │ ├── __init__.py │ ├── ai_handler.py │ ├── ai_invoker.py │ └── token_handler.py ├── log │ └── __init__.py ├── settings │ ├── .secrets_template.toml │ ├── choose_best_solution_direct.toml │ ├── code_contests_prompt_analyze_and_fix.toml │ ├── code_contests_prompt_analyze_and_fix_direct.toml │ ├── code_contests_prompt_analyze_failure.toml │ ├── code_contests_prompts_baseline.toml │ ├── code_contests_prompts_choose_best_solution.toml │ ├── code_contests_prompts_fix_solution.toml │ ├── code_contests_prompts_generate_ai_tests.toml │ ├── code_contests_prompts_generate_possible_solutions.toml │ ├── code_contests_prompts_reflect.toml │ ├── code_contests_prompts_solve.toml │ ├── code_contests_prompts_solve_direct.toml │ ├── code_contests_prompts_validate_ai_tests.toml │ ├── code_contests_prompts_validate_reflection.toml │ ├── config_loader.py │ └── configuration.toml ├── solve_dataset.py ├── solve_my_problem.py └── solve_problem.py ├── docs ├── docs │ ├── CNAME │ ├── assets │ │ ├── favicon.ico │ │ ├── logo.png │ │ └── logo.svg │ ├── css │ │ └── custom.css │ └── index.md ├── mkdocs.yml └── overrides │ ├── main.html │ └── partials │ ├── footer.html │ └── integrations │ └── analytics │ └── custom.html ├── my_problem_example.json ├── pics ├── comparison.png ├── computational_effort.png ├── example_problem.png ├── iterations.png └── proposed_flow.png ├── requirements.txt └── tests ├── __init__.py └── alpha_codium ├── __init__.py └── code_contests ├── __init__.py └── eval ├── __init__.py └── test_local_exec.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set default behavior to automatically normalize line endings. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted to native line endings on checkout. 5 | *.py text eol=lf 6 | 7 | # Declare files that will always have CRLF line endings on checkout. 8 | *.bat text eol=crlf 9 | 10 | # Declare files that should never be normalized. 
11 | *.jpg -text 12 | *.png -text 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: docs-ci 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - add-docs-portal 7 | permissions: 8 | contents: write 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Configure Git Credentials 15 | run: | 16 | git config user.name github-actions[bot] 17 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 18 | - uses: actions/setup-python@v5 19 | with: 20 | python-version: 3.x 21 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 22 | - uses: actions/cache@v4 23 | with: 24 | key: mkdocs-material-${{ env.cache_id }} 25 | path: .cache 26 | restore-keys: | 27 | mkdocs-material- 28 | - run: pip install mkdocs-material 29 | - run: pip install "mkdocs-material[imaging]" 30 | - run: mkdocs gh-deploy -f docs/mkdocs.yml --force -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # MacOS 2 | .DS_Store 3 | 4 | # Python virtual environment 5 | venv/ 6 | 7 | # codecontests datasets 8 | # https://huggingface.co/datasets/talrid/CodeContests_valid_and_test_AlphaCodium/blob/main/codecontests_valid_and_test_processed_alpha_codium.zip 9 | valid_and_test_processed/ 10 | 11 | # Python cache files 12 | **/__pycache__/ 13 | 14 | # Cache files generated during documentation builds 15 | docs/.cache/ 16 | 17 | # IDEA project-specific settings and configuration files 18 | .idea/ 19 | 20 | alpha_codium/settings/.secrets.toml 21 | dataset_output.json 22 | example.log 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | ENV TZ=UTC 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | RUN apt-get update && apt-get install -y \ 9 | clang \ 10 | curl \ 11 | git \ 12 | vim \ 13 | build-essential \ 14 | libffi-dev \ 15 | libssl-dev \ 16 | zlib1g-dev \ 17 | libbz2-dev \ 18 | libreadline-dev \ 19 | libsqlite3-dev \ 20 | software-properties-common \ 21 | vim 22 | 23 | 24 | RUN add-apt-repository ppa:deadsnakes/ppa -y 25 | 26 | RUN apt install python3.9-dev -y 27 | 28 | RUN apt install -y python3-pip \ 29 | python3.9-distutils 30 | 31 | RUN python3.9 -m pip install --upgrade pip 32 | 33 | RUN python3.9 --version 34 | 35 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1 36 | RUN update-alternatives --set python /usr/bin/python3.9 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /alpha_codium/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | 6 | 7 | def set_all_seeds(seed): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | os.environ["PYTHONHASHSEED"] = str(seed) 11 | 12 | try: 13 | import tensorflow as tf 14 | tf.random.set_seed(seed) 15 | except ImportError: 16 | pass 17 | 18 | try: 19 | import torch 20 | torch.manual_seed(seed) 21 | if torch.cuda.is_available(): 22 | torch.cuda.manual_seed(seed) 23 | torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 
24 | except ImportError: 25 | pass 26 | 27 | set_all_seeds(1337) -------------------------------------------------------------------------------- /alpha_codium/code_contests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/alpha_codium/code_contests/__init__.py -------------------------------------------------------------------------------- /alpha_codium/code_contests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/alpha_codium/code_contests/data/__init__.py -------------------------------------------------------------------------------- /alpha_codium/code_contests/data/prepare_and_clean_dataset.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import json 4 | import os 5 | import shutil 6 | from collections import OrderedDict 7 | import time 8 | import numpy as np 9 | from datasets import Dataset 10 | 11 | from alpha_codium.code_contests.data.provider import CodeContestDataProvider 12 | from alpha_codium.log import get_logger, setup_logger 13 | from alpha_codium.gen.utils import evaluate_solution_on_subset 14 | from alpha_codium.settings.config_loader import get_settings 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def preapare_and_clean_dataset(dataset_name='valid_and_test'): 20 | 21 | # process base dataset 22 | output_dataset_name = 'valid_and_test_processed' 23 | base_path = os.path.expanduser(get_settings().etl.private_dataset_cache_dir) 24 | output_path = os.path.join(base_path, output_dataset_name) 25 | data_provider = CodeContestDataProvider(dataset_location=dataset_name) 26 | 27 | # add and process the multiple_solutions field 28 | data_provider = add_multiple_solutions_field(data_provider) 29 | 30 | # will add 'is_valid_test' field to all problems 31 | data_provider = add_is_valid_field(data_provider) 32 | 33 | data_provider = problem_3_validation_fix(data_provider) 34 | data_provider = problem_29_test_fix(data_provider) 35 | data_provider = problem_92_test_fix(data_provider) 36 | 37 | # sorting so that 'python' solutions will be first 38 | data_provider = sort_solution_by_language(data_provider) 39 | 40 | # calc if there are valid solutions to the problem. 
if not, mark the problem as invalid 41 | data_provider = calc_is_valid_problem(data_provider) 42 | 43 | # save the dataset 44 | data_provider.dataset.save_to_disk(output_path) 45 | 46 | 47 | def calc_is_valid_problem(data_provider): 48 | get_settings().code_tester.sandbox = False 49 | th_correct = 0.2 # if less than 20% of the solutions are correct, mark the problem as invalid 50 | max_tests = 25 51 | 52 | for split_name in ['valid', 'test']: 53 | ds = data_provider.dataset[split_name] 54 | ds_dict = ds.to_dict() 55 | ds_dict['is_valid_problem'] = [True] * len(ds) 56 | solutions_list = ds_dict['solutions'] 57 | for i, solutions in enumerate(solutions_list): 58 | logger.info(f"processing problem {i} in split '{split_name}' for valid solutions") 59 | problem_dict = ds[i] 60 | s_list = solutions['solution'] 61 | l_list = solutions['language'] 62 | s_list = [s for s, l in zip(s_list, l_list) if 'python' in l.lower()] 63 | l_list = [l for l in l_list if 'python' in l.lower()] 64 | if len(s_list) < 5: 65 | logger.info(f"problem {i} in split '{split_name}' has less than 5 python solutions, cannot validate") 66 | continue 67 | test_failed_private_list = [] 68 | test_failed_generated_list = [] 69 | counter = 0 70 | timeout_len = 60 # 60 seconds 71 | start_time = time.time() 72 | for language, sol in zip(l_list, s_list): 73 | if 'python' not in language.lower(): 74 | continue 75 | counter += 1 76 | if counter > max_tests: 77 | continue 78 | if time.time() > start_time + timeout_len: 79 | continue 80 | # test_results, test_passed_public, test_failed_public, test_timeout_public \ 81 | # = evaluate_solution_on_subset('public_tests', problem_dict, sol, silent=True, break_on_timeout=True) 82 | test_results, test_passed_private, test_failed_private, test_timeout_private \ 83 | = evaluate_solution_on_subset('private_tests', problem_dict, sol, silent=True, 84 | break_on_timeout=True) 85 | test_results, test_passed_generate, test_failed_generate, test_timeout_generate \ 86 | = evaluate_solution_on_subset('generated_tests', problem_dict, sol, silent=True, 87 | break_on_timeout=True) 88 | test_failed_private_list.append(test_failed_private) 89 | test_failed_generated_list.append(test_failed_generate) 90 | if (time.time() > start_time + timeout_len) and counter > 10: 91 | continue 92 | if not test_failed_private_list: 93 | logger.info(f"problem {i} in split '{split_name}' has no python solutions") 94 | continue 95 | test_failed_private_list = np.array(test_failed_private_list) 96 | test_failed_generated_list = np.array(test_failed_generated_list) 97 | frac_correct = np.sum((test_failed_private_list + test_failed_generated_list) == 0) / len( 98 | test_failed_private_list) 99 | 100 | # final decision 101 | if frac_correct < th_correct: 102 | logger.info(f"Failed - problem {i} in split {split_name} is invalid, has {frac_correct*100}% correct solutions, " 103 | f"total of {len(test_failed_private_list)} solutions processed") 104 | ds_dict['is_valid_problem'][i] = False 105 | else: 106 | logger.info(f"Passed - problem {i} in split {split_name} is valid, has {frac_correct*100}% correct solutions, " 107 | f"total of {len(test_failed_private_list)} solutions processed") 108 | 109 | data_provider.dataset[split_name] = Dataset.from_dict(ds_dict) 110 | return data_provider 111 | def add_multiple_solutions_field(data_provider): 112 | for split_name in ['valid', 'test']: 113 | multiple_solutions_list = np.array([False] * len(data_provider.dataset[split_name])) 114 | ds = data_provider.dataset[split_name] 115 | for i, p in 
enumerate(ds): 116 | d_output = p['description'].split('Output\n')[1] 117 | if ('multiple solutions' in p['description'] or 'multiple possible solutions' in p['description'] 118 | or 'multiple possible solutions' in p['description'] or 'multiple' in d_output): 119 | # print(f"problem {i} has multiple solutions") 120 | # print(f"=========\n{p['description']}\n=======\n\n") 121 | multiple_solutions_list[i] = True 122 | else: 123 | multiple_solutions_list[i] = False 124 | 125 | data_provider.dataset[split_name] = data_provider.dataset[split_name].add_column('multiple_solutions', 126 | multiple_solutions_list) 127 | return data_provider 128 | 129 | 130 | def sort_solution_by_language(data_provider): 131 | # sorting so that 'python' solutions will be first 132 | for split_name in ['valid', 'test']: 133 | ds_dict = data_provider.dataset[split_name].to_dict() 134 | solutions_list = ds_dict['solutions'] 135 | for i, p in enumerate(solutions_list): 136 | np_lang = np.array(p['language']) 137 | ind_sorted = np.concatenate( 138 | (np.argwhere(np_lang == 'PYTHON3'), np.argwhere(np_lang == 'CPP'), np.argwhere(np_lang == 'JAVA'))) 139 | p['solution'] = [p['solution'][i[0]] for i in ind_sorted] 140 | p['language'] = [p['language'][i[0]] for i in ind_sorted] 141 | data_provider.dataset[split_name] = Dataset.from_dict(ds_dict) 142 | return data_provider 143 | def add_is_valid_field(data_provider): 144 | for split_name in ['valid', 'test']: 145 | ds_dict = data_provider.dataset[split_name].to_dict() 146 | ds_dict['public_tests'][0]['is_valid_test'] = None 147 | ds_dict['private_tests'][0]['is_valid_test'] = None 148 | ds_dict['generated_tests'][0]['is_valid_test'] = None 149 | data_provider.dataset[split_name] = Dataset.from_dict(ds_dict) 150 | return data_provider 151 | 152 | def problem_3_validation_fix(data_provider): 153 | # problem 3 validation fix generated tests 154 | ind_problem_valid = 3 155 | split_name = 'valid' 156 | dataset_dict = data_provider.dataset[split_name].to_dict() 157 | p_3 = data_provider.dataset[split_name][ind_problem_valid] 158 | p_3_generated_tests = p_3['generated_tests'] 159 | is_valid_test = [True] * len(p_3_generated_tests['input']) 160 | count_false = 0 161 | count_correct = 0 162 | for i, input in enumerate(p_3_generated_tests['input']): 163 | n, m, x = input.splitlines()[0].split() 164 | n = int(n) 165 | m = int(m) 166 | a = input.splitlines()[1].split() 167 | b = input.splitlines()[2].split() 168 | if (n != len(a) or m != len(b)): # according to the description, they should be equal 169 | count_false += 1 170 | is_valid_test[i] = False 171 | else: 172 | count_correct += 1 173 | dataset_dict['generated_tests'][ind_problem_valid]['is_valid_test'] = is_valid_test 174 | data_provider.dataset[split_name] = Dataset.from_dict(dataset_dict) 175 | return data_provider 176 | 177 | def problem_29_test_fix(data_provider): 178 | ind_problem_test = 29 179 | split_name = 'test' 180 | dataset_dict = data_provider.dataset[split_name].to_dict() 181 | p_29 = data_provider.dataset[split_name][ind_problem_test] 182 | p_29_generated_tests = p_29['generated_tests'] 183 | is_valid_arr_generated = [True] * len(p_29_generated_tests['input']) 184 | for i, input in enumerate(p_29_generated_tests['input']): 185 | for l in input.split(): 186 | l_n = np.array(list(map(int, l.split()))) 187 | if any(l_n < 0): # according to the description, they should be >=0 188 | is_valid_arr_generated[i] = False 189 | break 190 | 191 | s = input.split('\n', 1) 192 | n = int(s[0].strip()) 193 | a = 
s[1].strip().split('\n') 194 | for j in range(n): 195 | num_elements = int(a[2 * j].strip()) 196 | if num_elements != len(a[2 * j + 1].strip().split(' ')): # according to the description, they should be equal 197 | is_valid_arr_generated[i] = False 198 | break 199 | 200 | 201 | dataset_dict['generated_tests'][ind_problem_test]['is_valid_test'] = is_valid_arr_generated 202 | data_provider.dataset[split_name] = Dataset.from_dict(dataset_dict) 203 | return data_provider 204 | 205 | def problem_92_test_fix(data_provider): 206 | ind_problem_test = 92 207 | split_name = 'test' 208 | dataset_dict = data_provider.dataset[split_name].to_dict() 209 | p_92 = data_provider.dataset[split_name][ind_problem_test] 210 | p_92_private_tests = p_92['private_tests'] 211 | is_valid_arr_private = [True] * len(p_92_private_tests['input']) 212 | for i, input in enumerate(p_92_private_tests['input']): 213 | if len(set( 214 | input)) != 4: # {'a', 'b', '1', '\n'} - according to the description, the string should contain only 'a' and 'b' 215 | is_valid_arr_private[i] = False 216 | 217 | p_92_generated_tests = p_92['generated_tests'] 218 | is_valid_arr_generated = [True] * len(p_92_generated_tests['input']) 219 | for i, input in enumerate(p_92_generated_tests['input']): 220 | if len(set( 221 | input)) != 4: # {'a', 'b', '1', '\n'} - according to the description, the string should contain only 'a' and 'b' 222 | is_valid_arr_generated[i] = False 223 | 224 | dataset_dict['generated_tests'][ind_problem_test]['is_valid_test'] = is_valid_arr_generated 225 | dataset_dict['private_tests'][ind_problem_test]['is_valid_test'] = is_valid_arr_private 226 | data_provider.dataset[split_name] = Dataset.from_dict(dataset_dict) 227 | return data_provider 228 | 229 | if __name__ == "__main__": 230 | preapare_and_clean_dataset() 231 | -------------------------------------------------------------------------------- /alpha_codium/code_contests/data/provider.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | from typing import Iterable 4 | 5 | import duckdb 6 | import numpy as np 7 | import pandas as pd 8 | from datasets import Dataset, DatasetDict, load_dataset, load_from_disk 9 | from datasets.features.features import Sequence, Value 10 | 11 | from alpha_codium.settings.config_loader import get_settings 12 | 13 | problem_translations = ("source", "difficulty") 14 | 15 | solution_translations = ("solutions", "incorrect_solutions") 16 | 17 | 18 | class CodeContestDataProvider: 19 | 20 | def __init__(self, dataset_location, connection=None): 21 | self.private_datasets_root = os.path.expanduser( 22 | get_settings().config.private_dataset_cache_dir 23 | ) 24 | ( 25 | self.dataset_location, 26 | self.dataset_name, 27 | self.load_from_disk, 28 | ) = self.parse_location(dataset_location) 29 | self.dataset = self.load_dataset() 30 | self.connection = connection or duckdb.connect() 31 | self.connect(self.dataset) 32 | 33 | 34 | @staticmethod 35 | def find_problem(ds, problem_name, split_name=None, evaluation_test_type = None): 36 | if split_name: 37 | ds = ds[split_name] 38 | example = None 39 | if not problem_name: 40 | for e in ds: 41 | if evaluation_test_type: 42 | tests = e.get(evaluation_test_type) 43 | if tests and tests.get("input"): 44 | example = e 45 | break 46 | else: 47 | example = e 48 | break 49 | else: 50 | problems = ds.filter(lambda example: example['name'] == problem_name) 51 | if problems: 52 | example = problems[0] 53 | else: 54 | raise ValueError( 55 | 
f"problem with name {problem_name} doesn't exist in dataset {ds.info.dataset_name} in split {split_name}") 56 | return example 57 | 58 | @staticmethod 59 | def prepare_for_evaluation( 60 | predictions, source_of_truth, evaluation_test_type 61 | ): 62 | preds = predictions 63 | sot = source_of_truth 64 | sot = sot.select_columns(["name", evaluation_test_type]) 65 | sot = sot.rename_column("name", "task_name") 66 | sot = sot.flatten() 67 | sot = sot.rename_column(f"{evaluation_test_type}.input", "tests_inputs") 68 | sot = sot.rename_column(f"{evaluation_test_type}.output", "tests_outputs") 69 | 70 | joined = sot.to_pandas().merge(preds.to_pandas(), on="task_name", how="left") 71 | joined["predictions"] = joined[["task_name", "solution_candidates"]].to_dict( 72 | "records" 73 | ) 74 | joined["references"] = joined[["tests_inputs", "tests_outputs"]].to_dict( 75 | "records" 76 | ) 77 | 78 | # Retain only the 'predictions' and 'references' columns 79 | joined = joined[["predictions", "references"]] 80 | restructured_dataset = Dataset.from_pandas(joined) 81 | return restructured_dataset 82 | 83 | def parse_location(self, dataset_location): 84 | result_location = dataset_location 85 | dataset_name = dataset_location.split(os.path.sep)[-1] 86 | load_from_disk = True 87 | if load_from_disk: 88 | if not result_location.startswith(os.path.sep): 89 | result_location = os.path.join( 90 | self.private_datasets_root, result_location 91 | ) 92 | return result_location, dataset_name, load_from_disk 93 | 94 | @staticmethod 95 | def prepare_code_contest_split_for_eval( 96 | ds, evaluation_test_type="public_tests", task_name_column="name", 97 | path_to_solutions_column="solutions.solution" 98 | ): 99 | solutions = ds.flatten() 100 | solutions = solutions.rename_column( 101 | path_to_solutions_column, "solution_candidates" 102 | ) 103 | solutions = solutions.rename_column(task_name_column, "task_name") 104 | solutions = solutions.select_columns(["task_name", "solution_candidates"]) 105 | return CodeContestDataProvider.prepare_for_evaluation( 106 | predictions=solutions, 107 | source_of_truth=ds, 108 | evaluation_test_type=evaluation_test_type, 109 | ) 110 | 111 | def show(self, ds, paths_to_python, paths_to_free_text): 112 | result = ds.flatte() 113 | 114 | def format_example(example): 115 | for code_col in paths_to_python: 116 | import black 117 | 118 | example[code_col] = black.format_str(example[code_col]) 119 | for col in paths_to_free_text: 120 | example[col] = example[col].replace("\\n", "\n") 121 | 122 | pretty = result.map(format_example) 123 | return pretty 124 | 125 | def load_dataset(self): 126 | if self.load_from_disk: 127 | f = load_from_disk 128 | else: 129 | f = load_dataset 130 | 131 | return f(self.dataset_location) 132 | 133 | def connect(self, ds): 134 | if hasattr(ds, "keys"): 135 | for split in self.dataset.keys(): 136 | split_ds = self.dataset[split] 137 | table = split_ds.data.table 138 | self.connection.register(f"{split_ds.info.dataset_name}_{split}", table) 139 | else: 140 | self.connection.register(f"{ds.info.dataset_name}", ds.data.table) 141 | 142 | def get_splits(self): 143 | return self.dataset.keys() 144 | 145 | @staticmethod 146 | def sample(ds, fraction=0.1): 147 | table = ds 148 | sample_size = int(len(table) * fraction) 149 | indices = np.random.choice(len(table), sample_size, replace=False) 150 | sampled_table = table.select(indices) 151 | return sampled_table 152 | 153 | def query(self, query_string) -> pd.DataFrame: 154 | return self.connection.query(query_string).df() 155 
| 156 | def translate_references(self, ds): 157 | expand = False 158 | if not isinstance(ds, DatasetDict): 159 | to_translate = {"ds": ds} 160 | expand = True 161 | else: 162 | to_translate = ds 163 | for ds_name, ds_val in to_translate.items(): 164 | for col in problem_translations: 165 | translated_col = ds_val.features[col].int2str(ds_val[col]) 166 | ds_val = ds_val.remove_columns([col]) 167 | ds_val = ds_val.add_column(col, translated_col) 168 | 169 | def translate_sequence_references(example, ds): 170 | for col in solution_translations: 171 | translator = ds.features[col].feature["language"] 172 | arr = example[col]["language"] 173 | translated_solution = [translator.int2str(item) for item in arr] 174 | example[col]["language"] = translated_solution 175 | 176 | return example 177 | 178 | new_features = ds_val.features.copy() 179 | for col in solution_translations: 180 | new_features[col] = Sequence( 181 | feature={"language": Value("string"), "solution": Value("string")} 182 | ) 183 | 184 | ds_val = ds_val.map( 185 | lambda example, ds=ds_val: translate_sequence_references( 186 | example=example, ds=ds 187 | ), 188 | features=new_features, 189 | ) 190 | to_translate[ds_name] = ds_val 191 | result = to_translate 192 | if expand: 193 | result = result[ds] 194 | return result 195 | 196 | def filter_solution_by_languages(self, ds, languages: Iterable[str], keep=True): 197 | languages_set = set(languages) 198 | 199 | def filter_solutions_by_languages(example): 200 | for sol_col in solution_translations: 201 | langs = example[sol_col]["language"] 202 | sols = example[sol_col]["solution"] 203 | 204 | filtered_languages = [ 205 | lang for lang in langs if (lang in languages_set) == keep 206 | ] 207 | filtered_solutions = [ 208 | s 209 | for idx, s in enumerate(sols) 210 | if (langs[idx] in languages_set) == keep 211 | ] 212 | 213 | example[sol_col] = { 214 | "language": filtered_languages, 215 | "solution": filtered_solutions, 216 | } 217 | 218 | return example 219 | 220 | ds = ds.map(filter_solutions_by_languages) 221 | return ds 222 | -------------------------------------------------------------------------------- /alpha_codium/code_contests/data/yaml_vs_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import yaml 3 | s1 = 'print("double quote string")' 4 | s2 = "print('single quote string')" 5 | s3 = 'print("""triple quote string""")' 6 | s4 = f"{s1}\n{s2}\n{s3}" 7 | 8 | # Create a dictionary with keys as variable names and values as the strings 9 | data = {'s1': s1, 's2': s2, 's3': s3, 's4': s4} 10 | 11 | # Convert the dictionary to a JSON-formatted string 12 | json_data = json.dumps(data, indent=2) 13 | print(json_data) 14 | 15 | # Convert the dictionary to a YAML-formatted string, with block scalar style 16 | yaml_data = yaml.dump(data, indent=2, default_style='|') 17 | print(yaml_data) 18 | 19 | -------------------------------------------------------------------------------- /alpha_codium/code_contests/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/alpha_codium/code_contests/eval/__init__.py -------------------------------------------------------------------------------- /alpha_codium/code_contests/eval/code_contests_metric.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current 
dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """The CodeContestsEval metric estimates the pass@k metric for code synthesis. 15 | This is an evaluation harness for the code_contests problem solving dataset 16 | described in the paper "Evaluating Large Language Models Trained on Code" 17 | (https://arxiv.org/abs/2107.03374).""" 18 | import itertools 19 | import os 20 | 21 | import datasets 22 | import evaluate 23 | import numpy as np 24 | 25 | from alpha_codium.code_contests.eval.code_test_runners import PythonTestsRunner 26 | 27 | local_runner = 'local' 28 | code_contests_runner = 'code_contests' 29 | 30 | 31 | _CITATION = """\ 32 | 33 | """ 34 | 35 | _DESCRIPTION = """\ 36 | This metric implements the evaluation harness for Deepmind's code_contests dataset. 37 | """ 38 | 39 | _KWARGS_DESCRIPTION = """ 40 | Calculates how good are predictions given some references, using certain scores 41 | Args: 42 | predictions: list of candidates to evaluate. Each candidates should be a list 43 | of strings with several code candidates to solve the problem. 44 | references: a list with a test for each prediction. Each test should evaluate the 45 | correctness of a code candidate. 46 | k: number of code candidates to consider in the evaluation (Default: [1, 10, 100]) 47 | num_workers: number of workers used to evaluate the candidate programs (Default: 4). 48 | timeout: 49 | Returns: 50 | pass_at_k: dict with pass rates for each k 51 | results: dict with granular results of each unittest 52 | Examples: 53 | >>> code_eval = evaluate.load("code_eval") 54 | >>> test_cases = ["assert add(2,3)==5"] 55 | >>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]] 56 | >>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2]) 57 | >>> print(pass_at_k) 58 | {'pass@1': 0.5, 'pass@2': 1.0} 59 | """ 60 | 61 | _WARNING = """ 62 | ################################################################################ 63 | !!!WARNING!!! 64 | ################################################################################ 65 | The "code_eval" metric executes untrusted model-generated code in Python. 66 | Although it is highly unlikely that model-generated code will do something 67 | overtly malicious in response to this test suite, model-generated code may act 68 | destructively due to a lack of model capability or alignment. 69 | Users are strongly encouraged to sandbox this evaluation suite so that it 70 | does not perform destructive actions on their host or network. For more 71 | information on how OpenAI sandboxes its code, see the paper "Evaluating Large 72 | Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). 73 | 74 | Once you have read this disclaimer and taken appropriate precautions, 75 | set the environment variable HF_ALLOW_CODE_EVAL="1". 
Within Python you can to this 76 | with: 77 | 78 | >>> import os 79 | >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" 80 | 81 | ################################################################################\ 82 | """ 83 | 84 | _LICENSE = """The MIT License 85 | 86 | Copyright (c) OpenAI (https://openai.com) 87 | 88 | Permission is hereby granted, free of charge, to any person obtaining a copy 89 | of this software and associated documentation files (the "Software"), to deal 90 | in the Software without restriction, including without limitation the rights 91 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 92 | copies of the Software, and to permit persons to whom the Software is 93 | furnished to do so, subject to the following conditions: 94 | 95 | The above copyright notice and this permission notice shall be included in 96 | all copies or substantial portions of the Software. 97 | 98 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 99 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 100 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 101 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 102 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 103 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 104 | THE SOFTWARE.""" 105 | 106 | 107 | os.environ["HF_ALLOW_CODE_EVAL"] = "1" 108 | 109 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 110 | class CodeContestsEval(evaluate.Metric): 111 | def _info(self): 112 | if self.config_name not in [ 113 | local_runner, code_contests_runner 114 | ]: 115 | raise KeyError( 116 | "You should supply a configuration name selected in " 117 | f'[{code_contests_runner}, {local_runner}]' 118 | ) 119 | return evaluate.MetricInfo( 120 | # This is the description that will appear on the metrics page. 121 | description=_DESCRIPTION, 122 | citation=_CITATION, 123 | inputs_description=_KWARGS_DESCRIPTION, 124 | # This defines the format of each prediction and reference 125 | features=datasets.Features( 126 | { 127 | "predictions": { 128 | "task_name": datasets.Value("string"), 129 | "solution_candidates": datasets.Sequence( 130 | datasets.Value("string") 131 | ), 132 | }, 133 | "references": { 134 | "tests_inputs": datasets.Sequence(datasets.Value("string")), 135 | "tests_outputs": datasets.Sequence(datasets.Value("string")), 136 | }, 137 | } 138 | ), 139 | homepage="", 140 | codebase_urls=[""], 141 | reference_urls=[""], 142 | license=_LICENSE, 143 | ) 144 | 145 | def _compute( 146 | self, 147 | predictions, 148 | references, 149 | k=[1, 10, 100], # noqa: B006 150 | num_workers=10, 151 | timeout=3.0, 152 | ): 153 | if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1": 154 | raise ValueError(_WARNING) 155 | 156 | if os.name == "nt": 157 | raise NotImplementedError( 158 | "This metric is currently not supported on Windows." 
159 | ) 160 | runner = PythonTestsRunner.factory(self.config_name) 161 | inputs, results = runner.bulk_test(num_workers, predictions, references) 162 | correct, total = self.pass_fail_ratio(results) 163 | total = np.array(total) 164 | correct = np.array(correct) 165 | ks = k 166 | pass_at_k = { 167 | f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 168 | for k in ks 169 | if (total >= k).all() 170 | } 171 | 172 | return pass_at_k, inputs, results 173 | 174 | def pass_fail_ratio(self, results): 175 | total, correct = [], [] 176 | for task_id, all_candidates_test_results in results.items(): 177 | print(task_id) 178 | print("======================================") 179 | candidate_final_results = [] 180 | for candidate_id, test_results in enumerate(all_candidates_test_results): 181 | _results = [ 182 | test_result.passed for test_result in test_results.test_results 183 | ] 184 | print(f"{candidate_id} test results: {_results}") 185 | candidate_pass_fail = all(_results) 186 | print(f"{candidate_id} final pass/fail: {candidate_pass_fail}") 187 | candidate_final_results.append(candidate_pass_fail) 188 | total.append(len(candidate_final_results)) 189 | correct.append(sum(candidate_final_results)) 190 | print(f"{task_id} candidates: {candidate_final_results}") 191 | print("======================================") 192 | return correct, total 193 | 194 | 195 | def estimate_pass_at_k(num_samples, num_correct, k): 196 | """Estimates pass@k of each problem and returns them in an array.""" 197 | 198 | def estimator(n: int, c: int, k: int) -> float: 199 | """Calculates 1 - comb(n - c, k) / comb(n, k).""" 200 | if n - c < k: 201 | return 1.0 202 | denominator = np.math.factorial(n) / (np.math.factorial(k) * np.math.factorial(n - k)) 203 | numerator = 1.0 204 | for i in range(n - c + 1, n + 1): 205 | numerator *= 1.0 - k / i 206 | return 1.0 - numerator / denominator 207 | 208 | if isinstance(num_samples, int): 209 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 210 | else: 211 | assert len(num_samples) == len(num_correct) 212 | num_samples_it = iter(num_samples) 213 | 214 | return np.array( 215 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 216 | ) 217 | -------------------------------------------------------------------------------- /alpha_codium/code_contests/eval/pass_at_k_evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from evaluate import load as load_metric 4 | 5 | from alpha_codium.code_contests.data.provider import CodeContestDataProvider 6 | from alpha_codium.settings.config_loader import get_settings 7 | 8 | 9 | def calculate_metrics(ds, k_values=[1, 10, 100]): # noqa: B006 10 | 11 | metric_path = os.path.join( 12 | os.path.dirname(os.path.abspath(__file__)), "code_contests_metric.py" 13 | ) 14 | metric = load_metric(metric_path, config_name=get_settings().code_tester.tester_type, module_type="metric") 15 | pass_at_k, inputs, result = metric.compute( 16 | predictions=ds["predictions"], references=ds["references"], k=k_values 17 | ) 18 | return pass_at_k, inputs, result 19 | 20 | 21 | def evaluate_code_contest_dataset( 22 | dataset_name, 23 | split_name='valid', 24 | k_values=[1, 10, 10], # noqa: B006 25 | evaluation_test_type='private_tests', 26 | path_to_solutions_column='solutions.solution', 27 | task_name_column='name', 28 | sample_rate=0.1, 29 | ): 30 | cc = CodeContestDataProvider(dataset_name) 31 | ds = cc.dataset[split_name] 32 | ds = cc.sample(ds, 
fraction=sample_rate) 33 | ds = CodeContestDataProvider.prepare_code_contest_split_for_eval(ds=ds, 34 | evaluation_test_type=evaluation_test_type, 35 | task_name_column=task_name_column, 36 | path_to_solutions_column=path_to_solutions_column) 37 | pass_at_k, inputs, result = calculate_metrics(ds, k_values=k_values) 38 | print(pass_at_k) 39 | 40 | 41 | def evaluate_gen_dataset(evaluation_test_type, ground_truth_dataset, ground_truth_split, k_values, solution_dataset): 42 | evaluation_set = CodeContestDataProvider(dataset_location=solution_dataset) 43 | gt_set = CodeContestDataProvider(dataset_location=ground_truth_dataset).dataset 44 | if ground_truth_split: 45 | gt_set = gt_set[ground_truth_split] 46 | prepared_solutions = evaluation_set.prepare_for_evaluation(evaluation_set.dataset, gt_set, 47 | evaluation_test_type=evaluation_test_type) 48 | pass_at_k, inputs, evaluation_results = calculate_metrics(prepared_solutions, k_values) 49 | print(pass_at_k) 50 | 51 | 52 | if __name__ == "__main__": 53 | evaluate_code_contest_dataset("assaf_test", evaluation_test_type="private_tests", sample_rate=0.05) 54 | -------------------------------------------------------------------------------- /alpha_codium/code_contests/eval/tracer.py: -------------------------------------------------------------------------------- 1 | import pysnooper 2 | 3 | from alpha_codium.settings.config_loader import get_settings 4 | 5 | filter_out_lines = ["Starting var:", 6 | "exec(", 7 | "Source path:", 8 | "globals_dict", 9 | "tracer.py", 10 | "snooping_inner_function()", 11 | "run_generated_code", 12 | "return function(*args, **kwargs)", 13 | "source_line = source[line_no - 1]", 14 | "Elapsed time:", 15 | "Return value:.. None"] 16 | 17 | snooper_kwargs = { 18 | 'color': False, 19 | 'relative_time': True, 20 | 'normalize': True, 21 | 'depth': get_settings().code_tester.trace_depth 22 | } 23 | 24 | snooper_kwargs_string = ", ".join(f"{key}={value}" for key, value in snooper_kwargs.items()) 25 | 26 | 27 | class FilteringTracer(pysnooper.tracer.Tracer): 28 | def trace(self, frame, event, arg): 29 | if not frame.f_code.co_filename == '<string>': 30 | return None 31 | 32 | return super().trace(frame, event, arg) 33 | 34 | 35 | class MockSourceLoader: 36 | 37 | def __init__(self, source): 38 | self.source = source 39 | 40 | def get_source(self, module_name): 41 | return self.source 42 | 43 | def wrap_solution(check_program): 44 | import_str = "import pysnooper as pysnooper\n" 45 | annotation = f"@custom_snoop(output=tracing, {snooper_kwargs_string})\n" 46 | entrypoint = "def run_code_contests_solution():\n" 47 | func_body = "\n".join([f"\t{line}" for line in check_program.split("\n")]) 48 | invocation = "\nrun_code_contests_solution()" 49 | return (import_str + annotation + entrypoint + func_body + invocation).strip() 50 | 51 | 52 | def trace_code(check_program, tracing): 53 | my_program = wrap_solution(check_program) 54 | # __name__ must be unique otherwise the tracer uses caching mechanisms that break its behavior 55 | globals_dict = {'__loader__': MockSourceLoader(my_program), 56 | 'tracing': tracing, 57 | '__name__': hash(my_program), 58 | 'custom_snoop': FilteringTracer} 59 | exec(my_program, globals_dict, {}) 60 | 61 | 62 | def clean_trace(trace_output): 63 | trace_lines = trace_output.split("\n") 64 | clean_lines = [line for line in trace_lines if not 65 | any(substring in line for substring in filter_out_lines)] 66 | clean_output = "\n".join(clean_lines) 67 | return clean_output 68 |
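The tracer above wraps a candidate solution inside a run_code_contests_solution() function, executes it under a filtering pysnooper tracer that writes to the caller-supplied tracing stream, and then strips pysnooper's own bookkeeping lines from the captured output. A minimal usage sketch, assuming pysnooper is installed and code_tester.trace_depth is set in the configuration; the candidate snippet and the calling code are illustrative only, not part of the repository:

    import io

    from alpha_codium.code_contests.eval.tracer import clean_trace, trace_code

    # Hypothetical candidate program; real callers pass model-generated solution code.
    candidate_code = "total = sum(i * i for i in range(5))\nprint(total)"

    trace_buffer = io.StringIO()                 # pysnooper writes the raw line-by-line trace here
    trace_code(candidate_code, trace_buffer)     # wraps, decorates and exec()s the candidate
    print(clean_trace(trace_buffer.getvalue()))  # trace with the filter_out_lines noise removed
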
-------------------------------------------------------------------------------- /alpha_codium/evaluate_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from collections import OrderedDict 4 | 5 | from alpha_codium.code_contests.data.provider import CodeContestDataProvider 6 | from alpha_codium.log import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | def evaluate_dataset_solution(dataset_name='valid_and_test_processed', 11 | split_name='test', 12 | solution_path_database='valid_database_solution.json'): 13 | """ 14 | Evaluate the performance of dataset solutions. 15 | 16 | Args: 17 | dataset_name (str, optional): The name of the dataset. Defaults to 'valid_and_test_processed'. 18 | split_name (str, optional): The name of the split. Defaults to 'test'. 19 | solution_path_database (str, optional): The path to the solution database file. Defaults to 'valid_database_solution.json'. 20 | """ 21 | 22 | # Load the dataset and solution database 23 | data_provider = CodeContestDataProvider(dataset_location=dataset_name) 24 | ds = data_provider.dataset[split_name] 25 | with open(solution_path_database, 'r') as f: 26 | database_solutions = json.load(f) 27 | database_solutions[split_name] = OrderedDict( 28 | sorted(database_solutions[split_name].items(), key=lambda x: int(x[0]))) 29 | 30 | # Initialize counters for passed and failed problems 31 | total_passed = 0 32 | total_failed = 0 33 | 34 | # Iterate over the solutions in the database 35 | for sol in database_solutions[split_name]: 36 | try: 37 | key_str = sol 38 | key_int = int(key_str) 39 | problem = ds[key_int] 40 | if problem.get('is_valid_problem', True) is False: 41 | print(f"problem {key_int} is not valid") 42 | continue 43 | solution = database_solutions[split_name][sol] 44 | passed_current = -1 45 | 46 | # scanning the iterations 47 | v_iter =[v for v in solution.values() if (v is not None and 'solution' in v)] 48 | for v in v_iter: 49 | if not v: 50 | continue 51 | test_failed_generate = v['test_failed_generate'] 52 | test_failed_private = v['test_failed_private'] 53 | test_passed_generate = v['test_passed_generate'] 54 | test_passed_private = v['test_passed_private'] 55 | if 'test_timeout_generate' in v: 56 | test_timeout_generate = v['test_timeout_generate'] 57 | test_timeout_private = v['test_timeout_private'] 58 | else: 59 | test_timeout_generate = 0 60 | test_timeout_private = 0 61 | 62 | if ((test_failed_generate + test_timeout_generate + test_failed_private + test_timeout_private) == 0 and 63 | (test_passed_generate + test_passed_private) > 0): 64 | print(f"problem {key_int} passed all tests") 65 | passed_current=1 66 | break 67 | else: 68 | passed_current = 0 69 | if passed_current == 1: 70 | total_passed += 1 71 | elif passed_current == 0: 72 | total_failed += 1 73 | except Exception as e: 74 | print(f"Error: {e}") 75 | pass 76 | 77 | # Print the total number of passed and failed problems 78 | print(f"total_passed: {total_passed}, total_failed: {total_failed}") 79 | 80 | # Calculate the pass rate 81 | pass_rate = total_passed / (total_passed + total_failed) 82 | print(f"pass rate: {pass_rate}") 83 | 84 | parser = argparse.ArgumentParser() 85 | parser.add_argument("--dataset_name", type=str, default="valid_and_test_processed") 86 | parser.add_argument("--split_name", type=str, default="valid") 87 | parser.add_argument("--database_solution_path", type=str, default="./gpt_3_solution_database_valid.json") 88 | 89 | if __name__ == 
"__main__": 90 | args = parser.parse_args() 91 | evaluate_dataset_solution(dataset_name=args.dataset_name, 92 | split_name=args.split_name, 93 | solution_path_database=args.database_solution_path) 94 | -------------------------------------------------------------------------------- /alpha_codium/gen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/alpha_codium/gen/__init__.py -------------------------------------------------------------------------------- /alpha_codium/gen/coding_competitor.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | from jinja2 import Environment, StrictUndefined 5 | 6 | from alpha_codium.code_contests.data.provider import CodeContestDataProvider 7 | from alpha_codium.gen.stages.run_baseline import run_baseline 8 | from alpha_codium.gen.stages.run_choose_best_solution import run_choose_best_solution 9 | from alpha_codium.gen.stages.run_evaluate_all_ai_tests import run_evaluate_all_ai_tests 10 | from alpha_codium.gen.stages.run_evaluate_public_tests import run_evaluate_public_tests 11 | from alpha_codium.gen.stages.run_generate_ai_test import run_generate_ai_tests 12 | from alpha_codium.gen.stages.run_generate_possible_solutions import run_generate_possible_solutions 13 | from alpha_codium.gen.stages.run_self_reflect import run_self_reflect 14 | from alpha_codium.gen.stages.run_initial_code_generation import run_initial_code_generation 15 | from alpha_codium.gen.stages.utils import set_configurations 16 | from alpha_codium.gen.utils import evaluate_solution_on_subset 17 | from alpha_codium.llm.ai_handler import AiHandler 18 | from alpha_codium.log import get_logger 19 | from alpha_codium.settings.config_loader import get_settings 20 | 21 | 22 | class CodeContestsCompetitor: 23 | def __init__(self): 24 | self.prompt = {} 25 | for set in get_settings(): 26 | if 'code_contests_prompt' in set.lower(): 27 | self.prompt[set.lower()] = get_settings()[set] 28 | self.ai_handler = AiHandler() 29 | 30 | def render(self, problem_json, prompt: str): 31 | environment = Environment(undefined=StrictUndefined) 32 | environment.globals["zip"] = zip 33 | environment.globals["enumerate"] = enumerate 34 | sys_prompt = environment.from_string(self.prompt[prompt].system).render(problem_json) 35 | usr_prompt = environment.from_string(self.prompt[prompt].user).render(problem_json) 36 | if hasattr(self.prompt[prompt], 'temperature'): 37 | temperature = self.prompt[prompt].temperature 38 | else: 39 | temperature = 0.2 40 | if hasattr(self.prompt[prompt], 'frequency_penalty'): 41 | frequency_penalty = self.prompt[prompt].frequency_penalty 42 | else: 43 | frequency_penalty = None 44 | return sys_prompt, usr_prompt, temperature, frequency_penalty 45 | 46 | async def _run(self, model, problem, prompt:str = "code_contests_prompt_reflect"): 47 | system_prompt, user_prompt, temperature, frequency_penalty = self.render(problem, prompt) 48 | 49 | if frequency_penalty == None: 50 | frequency_penalty = get_settings().get("config.frequency_penalty") 51 | 52 | response, finish_reason = await self.ai_handler.chat_completion( 53 | model=model, system=system_prompt, user=user_prompt, 54 | temperature=temperature, frequency_penalty=frequency_penalty, 55 | ) 56 | return response, finish_reason 57 | 58 | async def run(self, problem, iteration=0, logger_ext=None): 59 | if logger_ext: 
60 | logger = logger_ext 61 | else: 62 | logger = get_logger(__name__) 63 | logger.info(f"Running code contests competitor, model {get_settings().config['model']}") 64 | 65 | try: 66 | if get_settings().get("solve.use_baseline", False): 67 | problem['code_recent_solution'] = await run_baseline(self, problem) 68 | else: 69 | # configurations 70 | problem = set_configurations(problem, iteration) 71 | 72 | # self-reflect 73 | problem = await run_self_reflect(self, problem) 74 | 75 | # generate solutions 76 | problem = await run_generate_possible_solutions(self, problem) 77 | 78 | # choose best solution 79 | problem = await run_choose_best_solution(self, problem) 80 | 81 | # generate ai tests 82 | problem = await run_generate_ai_tests(self, problem) 83 | 84 | # initial code generation 85 | problem = await run_initial_code_generation(self, problem) 86 | 87 | # evaluate on public tests 88 | problem = await run_evaluate_public_tests(self, problem) 89 | 90 | # evaluate on ai tests 91 | problem = await run_evaluate_all_ai_tests(self, problem) 92 | 93 | return problem['code_recent_solution'] 94 | except Exception as e: 95 | logging.error(f"Error: {e}") 96 | return "" 97 | 98 | def solve_problem_in_dataset(self, example, iteration=0, logger_ext=None): 99 | problem = {k: example.get(k) for k in ["name", "description", 'public_tests']} 100 | prediction = asyncio.run(self.run(problem=problem, iteration=iteration, logger_ext=logger_ext)) 101 | return prediction 102 | 103 | 104 | def solve_problem(dataset_name, 105 | split_name="valid", 106 | problem_name="", 107 | problem_number=0): 108 | 109 | # load dataset 110 | logger = get_logger(__name__) 111 | data_provider = CodeContestDataProvider(dataset_location=dataset_name) 112 | if problem_number and problem_name: 113 | logger.info(f"problem_number and problem_name are both specified, using problem_name") 114 | if not problem_name and problem_number: 115 | problem_name = data_provider.dataset[split_name][int(problem_number)]['name'] 116 | logger.info(f"problem_name: {problem_name}") 117 | 118 | # find problem 119 | problem = data_provider.find_problem(ds=data_provider.dataset, problem_name=problem_name, split_name=split_name) 120 | logger.info(f"problem['name']: {problem['name']}") 121 | 122 | # # check if problem is valid (at least one of the provided solutions actually passes the generated tests) 123 | # if not problem.get('is_valid_problem', True): 124 | # logger.info(f"problem['is_valid_problem'] == False, skipping") 125 | # return None, None 126 | 127 | # evaluate prev solutions 128 | evaluate_prev_solutions = get_settings().get("dataset.evaluate_prev_solutions", False) 129 | if evaluate_prev_solutions: 130 | try: 131 | if not problem['solutions']['solution']: 132 | logger.info("No public solutions for this problem") 133 | found_solution = False 134 | for index_published, sol_published in enumerate(problem['solutions']['solution']): 135 | if 'python' not in problem['solutions']['language'][index_published].lower(): 136 | found_solution = True 137 | continue 138 | logger.info(f"evaluating public solution {index_published} on private tests...") 139 | test_results, test_passed_private, test_failed_private, test_timeout_private \ 140 | = evaluate_solution_on_subset('private_tests', problem, sol_published, silent=True) 141 | logger.info(f"evaluating public solution {index_published} on generated tests...") 142 | test_results, test_passed_generate, test_failed_generate, test_timeout_generate = ( 143 | evaluate_solution_on_subset('generated_tests', problem, 
sol_published, silent=True)) 144 | 145 | if (test_failed_private == test_failed_generate == test_timeout_private == test_timeout_generate == 0) \ 146 | and test_passed_private + test_passed_generate > 0: 147 | logger.info(f"sol_published index {index_published} passed all tests:\n{sol_published}") 148 | found_solution = True 149 | break 150 | 151 | if not found_solution: 152 | logger.info(f"None of the public solutions passed all tests") 153 | except Exception as e: 154 | logger.error(f"Error evaluating public solutions: {e}") 155 | pass 156 | 157 | 158 | return solve_my_problem(problem) 159 | 160 | 161 | def solve_my_problem(problem): 162 | 163 | base_path = os.getcwd() 164 | logger = get_logger(__name__) 165 | 166 | solver = CodeContestsCompetitor() 167 | os.chdir(base_path) 168 | solution = solver.solve_problem_in_dataset(problem) 169 | logger.info(f"testing solution on private tests with prediction:\n{solution}") 170 | 171 | logger.info(f"evaluating solution on public tests...") 172 | test_results, test_passed_public, test_failed_public, test_timeout_public = evaluate_solution_on_subset('public_tests', 173 | problem, 174 | solution, 175 | silent=True) 176 | 177 | 178 | logger.info(f"evaluating solution on private tests...") 179 | test_results, test_passed_private, test_failed_private, test_timeout_private = evaluate_solution_on_subset('private_tests', 180 | problem, 181 | solution, 182 | silent=True) 183 | 184 | logger.info(f"evaluating solution on generated tests...") 185 | test_results, test_passed_generate, test_failed_generate, test_timeout_generate = evaluate_solution_on_subset( 186 | 'generated_tests', problem, solution, silent=True) 187 | 188 | logger.info(f"\ntest_passed_generate: {test_passed_generate}, test_passed_private: {test_passed_private}, test_passed_public: {test_passed_public}" 189 | f"\ntest_failed_generate: {test_failed_generate}, test_failed_private: {test_failed_private}, test_failed_public: {test_failed_public}" 190 | f"\ntest_timeout_generate: {test_timeout_generate}, test_timeout_private: {test_timeout_private}, test_timeout_public: {test_timeout_public}") 191 | 192 | return solution, test_results 193 | -------------------------------------------------------------------------------- /alpha_codium/gen/dataset_solver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from collections import OrderedDict 5 | 6 | from alpha_codium.code_contests.data.provider import CodeContestDataProvider 7 | from alpha_codium.gen.coding_competitor import CodeContestsCompetitor 8 | from alpha_codium.gen.utils import evaluate_solution_on_subset 9 | from alpha_codium.log import setup_logger, get_logger 10 | from alpha_codium.settings.config_loader import get_settings 11 | 12 | 13 | def solve_dataset(dataset_name='valid_and_test_processed', 14 | split_name='valid', 15 | database_solution_path='solution_database.json'): 16 | 17 | # load dataset 18 | data_provider = CodeContestDataProvider(dataset_location=dataset_name) 19 | setting = get_settings() 20 | num_problems = len(data_provider.dataset[split_name]) 21 | base_path = os.getcwd() 22 | setting.solve.reduce_verbose = True 23 | 24 | ## load previous solution-database if exists 25 | try: 26 | with open(database_solution_path, 'r') as f: 27 | database = json.load(f) 28 | database[split_name] = OrderedDict(sorted(database[split_name].items(), key=lambda x: int(x[0]))) 29 | except: 30 | print(f"Failed to load database from {database_solution_path}") 31 
| database = {split_name: {}} 32 | 33 | # iterate on problems 34 | for problem_number in range(0, num_problems): 35 | 36 | # skip if already ran 37 | logger = setup_logger() 38 | 39 | num_iterations = setting.get("dataset.num_iterations", 1) 40 | prev = database[split_name].get(str(problem_number), {}).get(f'iteration_{num_iterations-1}', {}) 41 | if not ((prev == {}) or (prev is None)): 42 | print(f"problem_number {problem_number} already ran") 43 | continue 44 | 45 | # check if problem is valid (at least one of the provided solutions actually passes the generated tests) 46 | if data_provider.dataset[split_name][problem_number].get('is_valid_problem', True) is False: 47 | logger.info(f"problem {problem_number} is not valid") 48 | continue 49 | 50 | os.chdir(base_path) 51 | logger.info(f"problem_number: {problem_number}") 52 | problem_name = data_provider.dataset[split_name][int(problem_number)]['name'] 53 | logger.info(f"problem_name: {problem_name}") 54 | problem = data_provider.find_problem(ds=data_provider.dataset, problem_name=problem_name, split_name=split_name) 55 | logger.info(f"problem['cf_tags']: {problem['cf_tags']}") 56 | 57 | # solve problem 58 | problem_database = {problem_number: {}} 59 | solver = CodeContestsCompetitor() 60 | for iteration in range(setting.get("dataset.num_iterations", 1)): 61 | it_str = f"iteration_{iteration}" 62 | problem_database[problem_number][it_str] = {} 63 | 64 | # skip if iteration already ran 65 | prev_iter = database[split_name].get(str(problem_number), {}).get(it_str, {}) 66 | if not ((prev_iter == {}) or (prev_iter is None)): 67 | print(f"prev_iter {iteration} already ran") 68 | problem_database[problem_number][it_str] = prev_iter 69 | if is_solved(prev_iter): 70 | logger.info(f"codium solved problem {problem_number} in iteration {iteration}") 71 | break 72 | continue 73 | 74 | # solve problem 75 | solution = solver.solve_problem_in_dataset(problem, iteration, logger) 76 | 77 | logger.info(f"solution code:\n{solution}") 78 | if not solution: 79 | logger.info(f"Failed to solve problem {problem_number} in iteration {iteration}") 80 | continue 81 | logger.info(f"Evaluating solution on public tests...") 82 | test_results, test_passed_public, test_failed_public, test_timeout_public = evaluate_solution_on_subset( 83 | 'public_tests', problem, solution, silent=True) 84 | 85 | logger.info(f"evaluating solution on private tests...") 86 | test_results, test_passed_private, test_failed_private, test_timeout_private = evaluate_solution_on_subset( 87 | 'private_tests', problem, solution, silent=True) 88 | 89 | logger.info(f"evaluating solution on generated tests...") 90 | test_results, test_passed_generate, test_failed_generate, test_timeout_generate = evaluate_solution_on_subset( 91 | 'generated_tests', problem, solution, silent=True) 92 | 93 | logger.info( 94 | f"\ntest_passed_public: {test_passed_public}, test_failed_public: {test_failed_public}, test_timeout_public: {test_timeout_public}\n" 95 | f"test_passed_private: {test_passed_private}, test_failed_private: {test_failed_private}, test_timeout_private: {test_timeout_private}\n" 96 | f"test_passed_generate: {test_passed_generate}, test_failed_generate: {test_failed_generate}, test_timeout_generate: {test_timeout_generate}\n") 97 | 98 | problem_database[problem_number][it_str]['solution'] = solution 99 | problem_database[problem_number][it_str]['test_passed_private'] = test_passed_private 100 | problem_database[problem_number][it_str]['test_failed_private'] = test_failed_private 101 | 
problem_database[problem_number][it_str]['test_timeout_private'] = test_timeout_private 102 | problem_database[problem_number][it_str]['test_passed_generate'] = test_passed_generate 103 | problem_database[problem_number][it_str]['test_failed_generate'] = test_failed_generate 104 | problem_database[problem_number][it_str]['test_timeout_generate'] = test_timeout_generate 105 | problem_database[problem_number][it_str]['test_passed_public'] = test_passed_public 106 | problem_database[problem_number][it_str]['test_failed_public'] = test_failed_public 107 | problem_database[problem_number][it_str]['test_timeout_public'] = test_timeout_public 108 | os.chdir(base_path) 109 | if is_solved(problem_database[problem_number][it_str]): 110 | logger.info(f"codium solved problem {problem_number} in iteration {iteration}") 111 | break 112 | else: 113 | logger.info(f"codium failed to solve problem {problem_number} in iteration {iteration}") 114 | database[split_name][problem_number] = problem_database[problem_number] 115 | os.chdir(base_path) 116 | with open(database_solution_path, 'w') as f: 117 | json.dump(database, f) 118 | 119 | 120 | def is_solved(s): 121 | if s['test_failed_private'] == 0 and s['test_failed_generate'] == 0 and \ 122 | s['test_timeout_private'] == 0 and s['test_timeout_generate'] == 0 and \ 123 | (s['test_passed_private'] + s['test_passed_generate']) > 0: 124 | return True 125 | else: 126 | return False 127 | -------------------------------------------------------------------------------- /alpha_codium/gen/generators.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | 4 | from alpha_codium.llm.ai_handler import AiHandler 5 | from alpha_codium.llm.ai_invoker import send_inference 6 | 7 | 8 | class SimplePrompt: 9 | def __init__(self, system_prompt="", temperature=0.2, frequency_penalty=0): 10 | self.system_prompt = system_prompt 11 | self.temperature = temperature 12 | self.frequency_penalty = frequency_penalty 13 | self.ai_handler = AiHandler() 14 | 15 | async def _run(self, model, user_prompt): 16 | response, finish_reason = await self.ai_handler.chat_completion( 17 | model=model, 18 | temperature=self.temperature, 19 | frequency_penalty=self.frequency_penalty, 20 | system=self.system_prompt, 21 | user=user_prompt, 22 | ) 23 | return response 24 | 25 | async def run(self, user_prompt): 26 | f = functools.partial(self._run, user_prompt=user_prompt) 27 | response = await send_inference(f) 28 | return response 29 | 30 | 31 | if __name__ == "__main__": 32 | p = SimplePrompt() 33 | asyncio.run(p.run("what is the capital city of Israel")) 34 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/indirect/run_analyze_and_fix_test_failure.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import difflib 3 | import functools 4 | import logging 5 | import yaml 6 | from alpha_codium.llm.ai_invoker import send_inference 7 | from alpha_codium.log import get_logger 8 | from alpha_codium.settings.config_loader import get_settings 9 | 10 | logger = get_logger(__name__) 11 | 12 | 13 | async def run_analyze_and_fix_test_failure(self, problem, error_str): 14 | counter_retry = 0 15 | while True: 16 | try: 17 | problem['error_str'] = error_str 18 | f = functools.partial(self._run, problem=problem, prompt=choose_prompt()) 19 | response_analyze_failure, _ = await send_inference(f) 20 | problem['error_str'] = '' 21 | 22 | 
response_analyze_failure = response_analyze_failure.rstrip("'` \n") # remove trailing spaces and newlines from yaml response 23 | if response_analyze_failure.startswith("```yaml"): 24 | response_analyze_failure = response_analyze_failure[8:] 25 | response_analyze_failure_yaml = yaml.safe_load(response_analyze_failure) 26 | problem['response_analyze_failure'] = response_analyze_failure 27 | code_recent_solution = response_analyze_failure_yaml['fixed_code'].rstrip("'` \n") 28 | 29 | # some cleaning 30 | if code_recent_solution .startswith("```python"): 31 | code_recent_solution= code_recent_solution[10:] 32 | elif code_recent_solution.startswith("python"): 33 | code_recent_solution = code_recent_solution[6:] 34 | try: 35 | ast.parse(code_recent_solution) 36 | except: 37 | code_recent_solution_fallback = '\n'.join(code_recent_solution.splitlines()[:-1]).rstrip("'` \n") 38 | try: 39 | ast.parse(code_recent_solution_fallback) 40 | code_recent_solution = code_recent_solution_fallback 41 | except: 42 | logger.error(f"Invalid code:\n{code_recent_solution}") 43 | return problem 44 | problem['code_recent_solution'] = code_recent_solution 45 | 46 | # diff patch 47 | diff = difflib.unified_diff(problem['code_prev_solution'].splitlines(keepends=True), 48 | problem['code_recent_solution'].splitlines(keepends=True)) 49 | # patch = ''.join(diff) 50 | # if get_settings().solve.reduce_verbose: 51 | # logger.debug(f"diff:\n{patch}") 52 | # else: 53 | # logger.info(f"diff:\n{patch}") 54 | 55 | return problem 56 | except Exception as e: 57 | logging.error(f"'analyze_and_fix_test_failure' stage, counter_retry {counter_retry}, Error: {e}") 58 | counter_retry += 1 59 | if counter_retry > 2: 60 | raise e 61 | 62 | def choose_prompt(): 63 | if get_settings().get("solve.use_direct_solutions", False): 64 | return "code_contests_prompt_analyze_and_fix_direct" 65 | else: 66 | return "code_contests_prompt_analyze_and_fix" 67 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/indirect/run_analyze_tests_failure.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import functools 3 | import logging 4 | import yaml 5 | 6 | from alpha_codium.settings.config_loader import get_settings 7 | from alpha_codium.llm.ai_invoker import send_inference 8 | from alpha_codium.log import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | 13 | async def run_analyze_test_failure(self, problem,error_str): 14 | counter_retry = 0 15 | while True: 16 | try: 17 | problem['error_str'] = error_str 18 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompt_analyze_failure") 19 | response_analyze_failure, _ = await send_inference(f) 20 | problem['error_str'] = '' 21 | 22 | response_analyze_failure = response_analyze_failure.rstrip("'` \n") # remove trailing spaces and newlines from yaml response 23 | if response_analyze_failure.startswith("```yaml"): 24 | response_analyze_failure = response_analyze_failure[8:] 25 | problem['response_analyze_failure'] = response_analyze_failure 26 | response_analyze_failure_yaml = yaml.safe_load(response_analyze_failure) 27 | problem['what_went_wrong'] = response_analyze_failure_yaml['what_went_wrong'] 28 | problem['fixed_flow'] = response_analyze_failure_yaml['fixed_flow'] 29 | return problem 30 | except Exception as e: 31 | logging.error(f"'analyze_test_failure' stage, counter_retry {counter_retry}, Error: {e}") 32 | counter_retry += 1 33 | if counter_retry > 2: 34 | 
raise e -------------------------------------------------------------------------------- /alpha_codium/gen/stages/indirect/run_fix_code_from_tests_failure.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import functools 3 | import logging 4 | from alpha_codium.settings.config_loader import get_settings 5 | from alpha_codium.llm.ai_invoker import send_inference 6 | from alpha_codium.log import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | async def run_fix_code_from_tests_failure(self, problem,error_str): 12 | counter_retry = 0 13 | while True: 14 | try: 15 | problem['error_str'] = error_str 16 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompt_fix_solution") 17 | response_fixed_code, _ = await send_inference(f) 18 | problem['error_str'] = '' 19 | 20 | # some cleaning 21 | response_fixed_code = response_fixed_code.rstrip("'` \n") # remove trailing spaces and newlines from yaml response 22 | if response_fixed_code.startswith("```python"): 23 | response_fixed_code = response_fixed_code[10:] 24 | problem['code_recent_solution'] = response_fixed_code 25 | 26 | # diff patch 27 | diff = difflib.unified_diff(problem['code_prev_solution'].splitlines(keepends=True), 28 | response_fixed_code.splitlines(keepends=True)) 29 | # patch = ''.join(diff) 30 | # if get_settings().solve.reduce_verbose: 31 | # logger.debug(f"diff:\n{patch}") 32 | # else: 33 | # logger.info(f"diff:\n{patch}") 34 | 35 | return problem 36 | 37 | except Exception as e: 38 | logging.error(f"fix_code_from_tests_failure' stage, counter_retry {counter_retry}, Error: {e}") 39 | counter_retry += 1 40 | if counter_retry > 2: 41 | raise e 42 | 43 | 44 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/indirect/run_fix_self_reflect.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import functools 3 | import logging 4 | import yaml 5 | 6 | from alpha_codium.settings.config_loader import get_settings 7 | from alpha_codium.gen.utils import postprocess_response 8 | from alpha_codium.llm.ai_invoker import send_inference 9 | from alpha_codium.log import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | async def run_validate_self_reflect(self, problem): 15 | try: 16 | logger.info("--validate reflection stage--") 17 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompts_validate_reflection") 18 | 19 | # inference 20 | response_validate_reflect, _ = await send_inference(f) 21 | response_validate_reflect = response_validate_reflect.rstrip("` \n") 22 | if response_validate_reflect.startswith("```yaml"): 23 | response_validate_reflect = response_validate_reflect[8:] 24 | try: 25 | response_validate_reflect_yaml = yaml.safe_load(response_validate_reflect) 26 | except yaml.YAMLError: 27 | response_validate_reflect = postprocess_response(response_validate_reflect) # try to include only the yaml part 28 | response_validate_reflect_yaml = yaml.safe_load(response_validate_reflect) 29 | 30 | # check number of tests 31 | actual_number_of_tests = len(problem['public_tests']['input']) 32 | calculated_number_of_tests = len(response_validate_reflect_yaml['fixed_tests_explanations']) 33 | if actual_number_of_tests != calculated_number_of_tests: 34 | raise (f"Error: number of tests in validate self-reflection ({calculated_number_of_tests}) " 35 | f"does not match the actual number of tests 
({actual_number_of_tests})") 36 | 37 | problem['response_validate_self_reflect'] = response_validate_reflect 38 | problem['tests_explanations'] = response_validate_reflect_yaml['fixed_tests_explanations'] 39 | problem['tests_explanations_str'] = response_validate_reflect.split('tests_explanations:')[1] 40 | 41 | # re-order the public tests from easiest to hardest 42 | problem['public_tests']['original'] = copy.deepcopy(problem['public_tests']) 43 | problem['public_tests']['input'] = [t['input'] for t in problem['tests_explanations']] 44 | problem['public_tests']['output'] = [t['output'] for t in problem['tests_explanations']] 45 | problem['public_tests']['explanation'] = [t['explanation'] for t in problem['tests_explanations']] 46 | 47 | return problem 48 | except Exception as e: 49 | logging.error(f"Failed 'run_validate_self_reflect', Error: {e}") 50 | return problem 51 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/indirect/run_validate_ai_test.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | from alpha_codium.gen.utils import load_yaml 5 | from alpha_codium.llm.ai_invoker import send_inference 6 | from alpha_codium.log import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | async def run_validate_ai_tests(self, problem): 12 | counter_retry = 0 13 | while True: 14 | try: 15 | logger.info("--validate ai tests stage--") 16 | 17 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompts_validate_ai_tests") 18 | response_problem_tests, _ = await send_inference(f) 19 | problem['problem_ai_tests'] = load_yaml(response_problem_tests, 20 | keys_fix_yaml=["input:", "output:", "explanation", "what_was_wrong:"])['tests'] 21 | 22 | # clean up and parse the response 23 | for p in problem['problem_ai_tests']: 24 | p['input'] = str(p['input']).replace('\\n', '\n') 25 | p['output'] = str(p['output']).replace('\\n', '\n') 26 | 27 | return problem 28 | except Exception as e: 29 | logging.error(f"'validate ai tests' stage, counter_retry {counter_retry}, Error: {e}") 30 | counter_retry += 1 31 | if counter_retry > 2: 32 | # raise e 33 | return problem 34 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_baseline.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from alpha_codium.gen.utils import postprocess_response 4 | from alpha_codium.llm.ai_invoker import send_inference 5 | from alpha_codium.log import get_logger 6 | 7 | logger = get_logger(__name__) 8 | 9 | 10 | async def run_baseline(self, problem): 11 | try: 12 | logging.info("Using baseline prompt") 13 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompts_baseline") 14 | response_baseline, _ = await send_inference(f) 15 | recent_solution = postprocess_response(response_baseline) 16 | return recent_solution 17 | except Exception as e: 18 | logging.error(f"Error: {e}") 19 | exit(-1) 20 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_choose_best_solution.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from alpha_codium.llm.ai_invoker import send_inference 4 | from alpha_codium.log import get_logger 5 | from alpha_codium.gen.utils import load_yaml 6 | from 
alpha_codium.settings.config_loader import get_settings 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | async def run_choose_best_solution(self, problem): 12 | counter_retry = 0 13 | while True: 14 | try: 15 | logger.info("--choose best solution stage--") 16 | 17 | # get settings 18 | f = functools.partial(self._run, problem=problem, prompt=choose_prompt()) 19 | 20 | # inference 21 | response_best_solution, _ = await send_inference(f) 22 | response_best_solution_yaml = load_yaml(response_best_solution, 23 | keys_fix_yaml=["name:", "content:", "why:", "- "]) 24 | 25 | # update best solution 26 | problem['s_best_solution'] = response_best_solution 27 | if 's_possible_solutions' in problem: 28 | problem['s_other_solutions'] = [] 29 | for solution in problem['s_possible_solutions']: 30 | if solution['name'] != response_best_solution_yaml['name']: 31 | problem['s_other_solutions'].append(solution) 32 | 33 | return problem 34 | except Exception as e: 35 | logging.error(f"'run_choose_best_solution' stage, counter_retry {counter_retry}, Error: {e}") 36 | counter_retry += 1 37 | if counter_retry > 2: 38 | raise e 39 | 40 | 41 | def choose_prompt(): 42 | if get_settings().get("solve.use_direct_solutions", False): 43 | return "code_contests_prompts_choose_best_solution_direct" 44 | else: 45 | return "code_contests_prompts_choose_best_solution" -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_evaluate_all_ai_tests.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | from alpha_codium.gen.stages.indirect.run_analyze_and_fix_test_failure import run_analyze_and_fix_test_failure 5 | from alpha_codium.gen.stages.run_tests import run_tests 6 | from alpha_codium.log import get_logger 7 | from alpha_codium.settings.config_loader import get_settings 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | async def run_evaluate_all_ai_tests(self, problem): 13 | try: 14 | logger.info("--iterate on all ai tests stage--") 15 | 16 | ai_tests = problem['problem_ai_tests'] 17 | max_allowed_calls = get_settings().get("ai_tests.max_allowed_calls", 6) 18 | 19 | # evaluate ai tests 20 | actual_number_of_calls = 0 21 | for i, test in enumerate(ai_tests): 22 | counter = 0 23 | test_inputs = test['input'] 24 | test_outputs = test['output'] 25 | if not isinstance(test_inputs, list): 26 | test_inputs = [test_inputs] 27 | test_outputs = [test_outputs] 28 | 29 | # run the solution on the tests 30 | problem, test_passed, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 31 | = run_tests(self, problem, counter, test_inputs, test_outputs) 32 | 33 | # we passed without changing the code. Add the test to the passed tests list 34 | if test_passed: 35 | if test_inputs not in problem['passed_tests']['inputs']: 36 | logger.info(f"Passed ai tests without code fixing. adding to passed tests list") 37 | problem['passed_tests']['inputs'] += test_inputs 38 | problem['passed_tests']['outputs'] += test_outputs 39 | else: 40 | # cap the number of calls to the ai 41 | if actual_number_of_calls >= max_allowed_calls: 42 | if i < len(ai_tests) - len(problem['public_tests']['input']): # don't skip public tests 43 | logger.error(f"Failed to pass ai test. reached max number of calls") 44 | continue 45 | 46 | logger.error(f"Failed to pass ai tests. 
trying to fix code") 47 | last_code_solution = copy.deepcopy(problem['code_recent_solution']) 48 | 49 | # run 'analyze_and_fix_test_failure' stage 50 | problem = await run_analyze_and_fix_test_failure(self, problem, error_str) 51 | actual_number_of_calls += 1 52 | 53 | problem, test_passed2, non_empty_output2, error_str2, trace_str2, tests_timeout2, d_tot2 \ 54 | = run_tests(self, problem, counter, test_inputs, test_outputs) 55 | 56 | if not test_passed2 and (not 'sandbox error: ' in error_str): 57 | logger.error(f"Failed to pass ai tests with fixed code.") 58 | problem['code_recent_solution'] = last_code_solution 59 | else: # we passed the test after fixing the code 60 | 61 | # running previous passed tests again to make sure we didn't break anything 62 | if problem['passed_tests']['inputs']: 63 | problem, all_passed_prev, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 64 | = run_tests(self, problem, counter, 65 | problem['passed_tests']['inputs'], 66 | problem['passed_tests']['outputs']) 67 | if not all_passed_prev: 68 | logger.error(f"The fix broke prev passed tests. reverting to last solution") 69 | problem['code_recent_solution'] = last_code_solution 70 | continue 71 | 72 | if test_passed2: 73 | logger.info(f"Fixed current test, and passed prev tests. using new solution") 74 | if test_inputs not in problem['passed_tests']['inputs']: 75 | problem['passed_tests']['inputs'] += test_inputs 76 | problem['passed_tests']['outputs'] += test_outputs 77 | else: 78 | logger.info(f"Code doesnt crash, but still fails the test. using new solution") 79 | 80 | return problem 81 | except Exception as e: 82 | logging.error(f"Error in 'run_evaluate_all_ai_tests': {e}") 83 | return problem 84 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_evaluate_public_tests.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | from alpha_codium.gen.stages.indirect.run_analyze_and_fix_test_failure import run_analyze_and_fix_test_failure 5 | from alpha_codium.gen.stages.indirect.run_analyze_tests_failure import run_analyze_test_failure 6 | from alpha_codium.gen.stages.indirect.run_fix_code_from_tests_failure import run_fix_code_from_tests_failure 7 | from alpha_codium.settings.config_loader import get_settings 8 | from alpha_codium.gen.stages.run_tests import run_tests 9 | from alpha_codium.log import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | async def run_evaluate_public_tests(self, problem): 15 | counter_retry = 0 16 | while True: 17 | try: 18 | logger.info("--iterate on public tests stage--") 19 | 20 | # configurations 21 | problem['use_self_reflection_public'] = get_settings().get('public_tests.use_self_reflection', False) 22 | max_allowed_fixes = get_settings().get("public_tests.max_allowed_calls", 6) 23 | max_fixes_per_test = get_settings().get("public_tests.max_fixes_per_test", 3) 24 | if len(problem['public_tests']['input']) == 1: 25 | max_fixes_per_test += 1 26 | 27 | # evaluate on public tests one by one 28 | test_inputs_all = problem['public_tests']['input'] 29 | test_outputs_all = problem['public_tests']['output'] 30 | test_explanations_all = problem['tests_explanations'] 31 | all_passed_public = True 32 | number_of_llm_fixes = 0 33 | for test_inputs, test_outputs, test_explanation in zip(test_inputs_all, test_outputs_all, 34 | test_explanations_all): 35 | if not isinstance(test_inputs, list): 36 | test_inputs = [test_inputs] 37 | 
test_outputs = [test_outputs] 38 | problem['test_explanation_current'] = test_explanation 39 | problem['use_test_explanations_public'] = get_settings().get('public_tests.use_test_explanations', False) 40 | 41 | # loop to fix specific test 42 | counter_test = 0 43 | passed_specific_test = False 44 | last_code_solution = copy.deepcopy(problem['code_recent_solution']) 45 | best_solution = copy.deepcopy(problem['code_recent_solution']) 46 | best_d = float('inf') 47 | while not passed_specific_test: 48 | 49 | # run the code on the test 50 | problem, passed_specific_test, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 51 | = run_tests(self, problem, counter_test, test_inputs, test_outputs) 52 | 53 | # save the best solution so far 54 | if -1 < d_tot < best_d: 55 | if counter_test > 0: 56 | logger.info(f"Found better solution, d_tot: {d_tot}") 57 | best_solution = copy.deepcopy(problem['code_recent_solution']) 58 | best_d = d_tot 59 | 60 | # cap the number of calls to the ai 61 | if not passed_specific_test and number_of_llm_fixes >= max_allowed_fixes: 62 | logger.debug(f"Failed to pass public test. reached max number of calls") 63 | break 64 | 65 | # analyze the tests results 66 | counter_test += 1 67 | logger.info(f"counter: {counter_test}") 68 | if passed_specific_test: 69 | logger.info(f"Passed a public test after {counter_test-1} attempts") 70 | if test_inputs not in problem['passed_tests']['inputs']: 71 | problem['passed_tests']['inputs'] += test_inputs 72 | problem['passed_tests']['outputs'] += test_outputs 73 | break 74 | elif counter_test > max_fixes_per_test: 75 | logger.debug(f"Failed to pass public tests after {max_fixes_per_test} attempts") 76 | break 77 | elif not non_empty_output: 78 | logging.debug("Failed to pass public tests. actual_output is empty") 79 | problem['code_recent_solution'] = last_code_solution 80 | continue 81 | else: 82 | # tests run. save the last solution 83 | problem['code_prev_solution'] = copy.deepcopy(problem['code_recent_solution']) 84 | 85 | if not get_settings().get("public_tests.single_stage_fix", False): 86 | # run 'analyze_and_fix_test_failure' stage 87 | problem = await run_analyze_and_fix_test_failure(self, problem, error_str) 88 | else: 89 | # run 'analyze_test_failure' stage 90 | problem = await run_analyze_test_failure(self, problem, error_str) 91 | 92 | # run 'fix_code_from_tests_failure' stage 93 | problem = await run_fix_code_from_tests_failure(self, problem, error_str) 94 | number_of_llm_fixes += 1 95 | 96 | # evaluate previous tests that passed. if they fail, revert to last solution 97 | if problem['passed_tests']['inputs']: 98 | problem, passed_prev_test, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 99 | = run_tests(self, problem, counter_test, 100 | problem['passed_tests']['inputs'], 101 | problem['passed_tests']['outputs']) 102 | if not passed_prev_test: 103 | logger.error(f"The fix broke prev passed tests. reverting to last solution") 104 | problem['code_recent_solution'] = last_code_solution 105 | continue 106 | 107 | # if not passed_specific_test: 108 | # if problem['passed_tests']['inputs']: 109 | # logger.error(f"Public test - reverting to initial solution, where: '{problem['passed_tests']['inputs']}' passed") 110 | # problem['code_recent_solution'] = last_code_solution 111 | # else: # no solution passed so far. 112 | # pass 113 | # logger.error("No solution passed so far. 
continuing to next test") 114 | # # logger.error(f'Public test - Reverting to best solution so far, d_tot: {best_d}') 115 | # # problem['code_recent_solution'] = best_solution 116 | all_passed_public = all_passed_public and passed_specific_test 117 | 118 | if all_passed_public: 119 | logger.info(f"==================") 120 | logger.info(f"Passed all public tests") 121 | logger.info(f"==================") 122 | else: 123 | logger.info(f"==================") 124 | logger.info(f"Failed to pass all public tests") 125 | logger.info(f"==================") 126 | 127 | return problem 128 | except Exception as e: 129 | logging.error(f"'public tests' stage, counter_retry {counter_retry}, Error: {e}") 130 | counter_retry += 1 131 | if counter_retry > 2: 132 | raise e 133 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_generate_ai_test.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | from alpha_codium.gen.stages.indirect.run_validate_ai_test import run_validate_ai_tests 5 | from alpha_codium.gen.utils import load_yaml 6 | from alpha_codium.settings.config_loader import get_settings 7 | from alpha_codium.llm.ai_invoker import send_inference 8 | from alpha_codium.log import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | 13 | async def run_generate_ai_tests(self, problem): 14 | counter_retry = 0 15 | while True: 16 | try: 17 | logger.info("--generate ai tests stage--") 18 | 19 | # get settings 20 | validate_ai_tests = get_settings().get('generate_ai_tests.validate_ai_tests', False) 21 | problem['number_of_ai_tests'] = get_settings().get("generate_ai_tests.number_of_ai_tests", 8) 22 | problem['use_test_explanations_possible_solutions'] = get_settings().get('generate_ai_tests.use_test_explanations') 23 | 24 | # get prompt 25 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompts_generate_ai_tests") 26 | 27 | # inference 28 | response_problem_tests, _ = await send_inference(f) 29 | problem['problem_ai_tests'] = load_yaml(response_problem_tests, 30 | keys_fix_yaml=["input:", "output:", "explanation:"])['tests'] 31 | problem['problem_ai_simple_test'] = problem['problem_ai_tests'][0] 32 | 33 | if validate_ai_tests: 34 | problem = await run_validate_ai_tests(self, problem) 35 | 36 | # adding public tests to the beginning and end of the list, for the ai-iterate stage 37 | if get_settings().get('generate_ai_tests.add_public_tests_to_ai_tests', False): 38 | for public_input, public_output in zip(problem['public_tests']['input'], 39 | problem['public_tests']['output']): 40 | # to the beginning of the list 41 | problem['problem_ai_tests'].insert(0, {'input': public_input, 'output': public_output}) 42 | # to the end of the list 43 | problem['problem_ai_tests'].append({'input': public_input, 'output': public_output}) 44 | 45 | return problem 46 | except Exception as e: 47 | logging.error(f"'generate ai tests' stage, counter_retry {counter_retry}, Error: {e}") 48 | counter_retry += 1 49 | if counter_retry > 2: 50 | raise e 51 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_generate_possible_solutions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import functools 3 | import logging 4 | import yaml 5 | 6 | from alpha_codium.gen.utils import load_yaml 7 | from alpha_codium.settings.config_loader import get_settings 8 
| from alpha_codium.llm.ai_invoker import send_inference 9 | from alpha_codium.log import get_logger 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | async def run_generate_possible_solutions(self, problem): 15 | counter_retry = 0 16 | while True: 17 | try: 18 | logger.info("--generate possible solutions stage--") 19 | if get_settings().get("solve.use_direct_solutions", False): 20 | return problem 21 | 22 | # get settings 23 | problem['max_num_of_possible_solutions'] = get_settings().get('possible_solutions.max_num_of_possible_solutions') 24 | problem['use_test_explanations_possible_solutions'] = get_settings().get('possible_solutions.use_test_explanations') 25 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompt_generate_possible_solutions") 26 | 27 | # inference 28 | response_possible_solutions, _ = await send_inference(f) 29 | response_possible_solutions_yaml = load_yaml(response_possible_solutions) 30 | 31 | if get_settings().get('possible_solutions.remove_bruce_force_solutions'): 32 | for i, s in enumerate(response_possible_solutions_yaml['possible_solutions']): 33 | if 'brute' in s['name'].lower(): 34 | response_possible_solutions_yaml['possible_solutions'].pop(i) 35 | response_possible_solutions = yaml.dump(response_possible_solutions_yaml, sort_keys=False, line_break="\n") 36 | break 37 | problem['s_possible_solutions'] = response_possible_solutions_yaml['possible_solutions'] 38 | problem['s_possible_solutions_str'] = response_possible_solutions.split('possible_solutions:')[1].strip() 39 | 40 | return problem 41 | except Exception as e: 42 | logging.error(f"'possible solutions' stage, counter_retry {counter_retry}, Error: {e}") 43 | counter_retry += 1 44 | if counter_retry > 2: 45 | raise e 46 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_initial_code_generation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | from alpha_codium.settings.config_loader import get_settings 5 | from alpha_codium.gen.stages.run_initial_solve import run_initial_solve 6 | from alpha_codium.gen.stages.run_tests import run_tests 7 | from alpha_codium.log import get_logger 8 | 9 | logger = get_logger(__name__) 10 | 11 | 12 | async def run_initial_code_generation(self, problem): 13 | counter_retry = 0 14 | while True: 15 | try: 16 | logger.info("--run initial code generation stage--") 17 | 18 | max_attempts = get_settings().get('initial_code_generation.max_attempts', 5) 19 | counter = 0 20 | 21 | # set the public tests as input 22 | test_input = problem['public_tests']['input'] 23 | test_output = problem['public_tests']['output'] 24 | 25 | # generate an initial code, using the top solution from the previous stage 26 | problem = await run_initial_solve(self, problem) 27 | 28 | # run the solution on the selected tests 29 | problem, passed_tests, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 30 | = run_tests(self, problem, counter, test_input, test_output) 31 | 32 | best_solution = copy.deepcopy(problem['code_recent_solution']) 33 | best_d = float('inf') # distance to the correct solution 34 | 35 | # set the distance to the correct solution 36 | if -1 < d_tot < best_d: 37 | best_solution = copy.deepcopy(problem['code_recent_solution']) 38 | best_d = d_tot 39 | 40 | while not passed_tests: 41 | counter += 1 42 | if counter > max_attempts: 43 | logger.error(f"Failed to pass tests after {counter - 1} attempts. 
exiting the stage") 44 | break 45 | 46 | s_best_solution_original = problem['s_best_solution'] 47 | if counter > 1 and 's_possible_solutions' in problem: 48 | # give two attempts to the highest ranked solution 49 | problem['s_best_solution'] = problem['s_possible_solutions'][ 50 | counter % len(problem['s_possible_solutions'])] 51 | problem = await run_initial_solve(self, problem) 52 | problem['s_best_solution'] = s_best_solution_original 53 | 54 | problem, passed_tests, non_empty_output, error_str, trace_str, tests_timeout, d_tot \ 55 | = run_tests(self, problem, counter, test_input, test_output) 56 | 57 | if passed_tests: 58 | logger.info(f"Passed tests after {counter} attempts") 59 | break 60 | else: 61 | logger.info(f"Failed to pass tests after {counter} attempts, d: {d_tot}, best_d so far: {best_d}") 62 | 63 | # save the best solution so far 64 | if -1 < d_tot < best_d: 65 | best_solution = copy.deepcopy(problem['code_recent_solution']) 66 | best_d = d_tot 67 | 68 | # set the best solution 69 | if not passed_tests and best_d < float('inf'): 70 | logger.error(f'Reverting to best solution so far, d_tot: {best_d}') 71 | problem['code_recent_solution'] = best_solution 72 | 73 | return problem 74 | except Exception as e: 75 | logging.error(f"'initial code generation' stage, counter_retry {counter_retry}, Error: {e}") 76 | counter_retry += 1 77 | if counter_retry > 2: 78 | raise e 79 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_initial_solve.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from alpha_codium.llm.ai_invoker import send_inference 4 | from alpha_codium.log import get_logger 5 | from alpha_codium.settings.config_loader import get_settings 6 | 7 | logger = get_logger(__name__) 8 | 9 | 10 | async def run_initial_solve(self, problem): 11 | counter_retry = 0 12 | while True: 13 | try: 14 | logger.info("--initial solve stage--") 15 | 16 | f = functools.partial(self._run, problem=problem, prompt=choose_prompt()) 17 | response_solve, _ = await send_inference(f) 18 | 19 | # clean up the response 20 | response_solve = response_solve.rstrip("` \n") 21 | if response_solve.startswith("```python"): 22 | response_solve = response_solve[10:] 23 | elif response_solve.startswith("python"): 24 | response_solve = response_solve[6:] 25 | 26 | # save the response 27 | problem['code_recent_solution'] = response_solve 28 | problem['code_prev_solution'] = response_solve 29 | return problem 30 | except Exception as e: 31 | logging.error(f"'initial solve' stage, counter_retry {counter_retry}, Error: {e}") 32 | counter_retry += 1 33 | if counter_retry > 2: 34 | raise e 35 | 36 | def choose_prompt(): 37 | if get_settings().get("solve.use_direct_solutions", False): 38 | return "code_contests_prompts_solve_direct" 39 | else: 40 | return "code_contests_prompts_solve" -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_self_reflect.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | import yaml 4 | 5 | from alpha_codium.gen.stages.indirect.run_fix_self_reflect import run_validate_self_reflect 6 | from alpha_codium.settings.config_loader import get_settings 7 | from alpha_codium.gen.utils import postprocess_response 8 | from alpha_codium.llm.ai_invoker import send_inference 9 | from alpha_codium.log import get_logger 10 | 11 | logger 
= get_logger(__name__) 12 | 13 | 14 | async def run_self_reflect(self, problem): 15 | counter_retry = 0 16 | while True: 17 | try: 18 | logger.info("--reflection stage--") 19 | 20 | # get settings 21 | validate_self_reflection = get_settings().get('self_reflection.validate_self_reflection', False) 22 | actual_number_of_tests = len(problem['public_tests']['input']) 23 | problem['actual_number_of_tests'] = actual_number_of_tests 24 | f = functools.partial(self._run, problem=problem, prompt="code_contests_prompt_reflect") 25 | 26 | # inference 27 | response_reflect, _ = await send_inference(f) 28 | response_reflect = response_reflect.rstrip("` \n") 29 | if response_reflect.startswith("```yaml"): 30 | response_reflect = response_reflect[8:] 31 | try: 32 | response_reflect_yaml = yaml.safe_load(response_reflect) 33 | except yaml.YAMLError: 34 | response_reflect = postprocess_response(response_reflect) # try to include only the yaml part 35 | response_reflect_yaml = yaml.safe_load(response_reflect) 36 | 37 | # check number of tests 38 | actual_number_of_tests = len(problem['public_tests']['input']) 39 | calculated_number_of_tests = len(response_reflect_yaml['tests_explanations']) 40 | if actual_number_of_tests != calculated_number_of_tests: 41 | raise ValueError(f"Error: number of tests in self-reflection ({calculated_number_of_tests}) " 42 | f"does not match the actual number of tests ({actual_number_of_tests})") 43 | problem['response_reflect'] = response_reflect 44 | try: 45 | problem['self_reflection'] = '- ' + '\n- '.join(response_reflect_yaml['self_reflection']) 46 | if problem['self_reflection'].startswith('- - '): 47 | problem['self_reflection'] = problem['self_reflection'][2:] 48 | except Exception: 49 | problem['self_reflection'] = response_reflect_yaml['self_reflection'] 50 | problem['tests_explanations'] = response_reflect_yaml['tests_explanations'] 51 | problem['tests_explanations_str'] = response_reflect.split('tests_explanations:')[1] 52 | 53 | # double validation self-reflection 54 | if validate_self_reflection: 55 | problem = await run_validate_self_reflect(self, problem) 56 | 57 | for s in problem['tests_explanations']: 58 | s['input'] = s['input'].replace('\\n', '\n') 59 | s['output'] = s['output'].replace('\\n', '\n') 60 | s['explanation'] = s['explanation'].replace('\\n', '\n') 61 | 62 | return problem 63 | except Exception as e: 64 | logging.error(f"'run_self_reflect' stage, counter_retry {counter_retry}, Error: {e}") 65 | counter_retry += 1 66 | if counter_retry > 2: 67 | raise e 68 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/run_tests.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | import numpy as np 4 | from alpha_codium.code_contests.eval.code_test_runners import eval_solution 5 | from alpha_codium.gen.utils import render_trace 6 | from alpha_codium.log import get_logger 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | def run_tests(self, problem, counter, test_inputs, test_outputs): 12 | try: 13 | # run the solution on the public tests 14 | logging.info(f"evaluating public tests.
attempt {counter}") 15 | test_inputs, results = eval_solution(example=problem, 16 | prediction=problem['code_recent_solution'], 17 | test_inputs=test_inputs, 18 | test_outputs=test_outputs, ) 19 | 20 | # analyze the tests results 21 | error_str = trace_str = "" 22 | all_passed = True 23 | non_empty_output = True 24 | tests_timeout = False 25 | if str(results.compilation_result.program_status) == 'ProgramStatus.kTimeout': 26 | tests_timeout = True 27 | all_passed = False 28 | for i, t in enumerate(results.test_results): 29 | error_str += f"test input:\n{test_inputs[i]}\n" \ 30 | f"expected output:\n{t.expected_output}\n" 31 | if t.actual_output: 32 | error_str += f"code output:\n{t.actual_output}\n'Timeout, took too long to run next test'\n" 33 | else: 34 | error_str += f"code output:\n'Timeout, took too long to run the test'\n" 35 | elif str(results.test_results[0].program_status) == 'ProgramStatus.kFailed': 36 | logger.error("failed to run solution") 37 | error_str = results.test_results[0].sandbox_result 38 | trace_str = f"trace information:\n{render_trace(results.test_results[0].trace)}\n\n" 39 | all_passed = False 40 | else: # ProgramStatus.passed 41 | # initially assume all tests passed 42 | all_passed = True 43 | non_empty_output = True 44 | 45 | # build the error string 46 | error_str = "" 47 | trace_str = "" 48 | for i, t in enumerate(results.test_results): 49 | if str(t.program_status) == 'ProgramStatus.kTimeout': 50 | if t.actual_output.strip(): 51 | t.actual_output += "\nTimeout, took too long to run the next test" 52 | else: 53 | t.actual_output = 'Timeout, took too long to run' 54 | t.passed = False 55 | elif str(t.program_status) == 'ProgramStatus.kFailed': 56 | t.actual_output = t.sandbox_result 57 | t.passed = False 58 | error_str += f"test input:\n{test_inputs[i]}\n" \ 59 | f"expected output:\n{t.expected_output}\n" \ 60 | f"code output:\n{t.actual_output}\n" \ 61 | # f"====================\n====================\n" 62 | 63 | trace_str += f"trace:\n{render_trace(t.trace)}\n" \ 64 | f"====================\n====================\n" 65 | 66 | # if get_settings().code_tester.calc_trace: 67 | # logger.debug(f"trace_str:\n{trace_str}") 68 | 69 | # is_all_passed_public = actual_output == expected_output 70 | all_passed = all_passed and t.passed 71 | non_empty_output = non_empty_output and t.actual_output 72 | 73 | # calculate the distance between the expected and actual output 74 | d_tot = calc_distance_between_results(non_empty_output, tests_timeout, test_outputs, results) 75 | 76 | return problem, all_passed, non_empty_output, error_str, trace_str, tests_timeout, d_tot 77 | except Exception as e: 78 | logging.error(f"Error: {e}") 79 | exit(-1) 80 | 81 | def calc_distance_between_results(non_empty_output, tests_timeout, test_outputs, results): 82 | try: 83 | d_tot = float('inf') 84 | if non_empty_output and not tests_timeout: 85 | d_tot = 0 86 | for i in range(len(test_outputs)): 87 | # logger.info(f"test_outputs[i]:\n{test_outputs[i]}") 88 | # logger.info(f"results.test_results[i].stdout:\n{results.test_results[i].stdout}") 89 | expected = test_outputs[i].rstrip().split('\n') 90 | actual = results.test_results[i].stdout.rstrip().split('\n') 91 | try: 92 | t1 = np.array(list(map(float, actual))) 93 | t2 = np.array(list(map(float, expected))) 94 | if t1.size == 0: 95 | return float('inf') 96 | d_tot += np.sum(np.abs(t1 - t2)) 97 | except: 98 | t1 = np.array(actual) 99 | t2 = np.array(expected) 100 | if t1.size == 0: 101 | return float('inf') 102 | d_tot += np.sum(t1 != t2) 103 | 
except: 104 | d_tot = float('inf') 105 | return d_tot 106 | -------------------------------------------------------------------------------- /alpha_codium/gen/stages/utils.py: -------------------------------------------------------------------------------- 1 | from alpha_codium.log import get_logger 2 | 3 | logger = get_logger(__name__) 4 | 5 | 6 | def set_configurations(problem, iteration=0): 7 | # configurations 8 | problem = {k: problem.get(k) for k in ["name", "description", "public_tests"]} 9 | problem['iteration'] = iteration 10 | 11 | # initialize passed tests field 12 | problem['passed_tests'] = {} 13 | problem['passed_tests']['inputs'] = [] 14 | problem['passed_tests']['outputs'] = [] 15 | 16 | # shorter description, without the input-output examples 17 | if '\nExample\n' in problem['description']: 18 | problem['description_short'] = problem['description'].split('\nExample\n')[0].strip() 19 | elif '\nExamples\n' in problem['description']: 20 | problem['description_short'] = problem['description'].split('\nExamples\n')[0].strip() 21 | else: 22 | logger.info(f"could not split description to short description, description: {problem['description']}") 23 | problem['description_short'] = problem['description'] 24 | return problem -------------------------------------------------------------------------------- /alpha_codium/gen/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List 3 | 4 | import yaml 5 | 6 | from alpha_codium.code_contests.eval.code_test_runners import eval_solution 7 | from alpha_codium.settings.config_loader import get_settings 8 | from alpha_codium.log import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | 13 | def clip_string(s: str, max_lines: int = None): 14 | lines = s.split("\n") 15 | if max_lines is not None and 0 < max_lines < len(lines): 16 | logger.debug(f"clipping string from {len(lines)} to {max_lines}") 17 | half_lines = int(max_lines / 2) 18 | lines = ( 19 | lines[:half_lines] + 20 | [f"\n.... 
{len(lines) - max_lines} omitted lines ....\n"] + 21 | lines[-half_lines:] 22 | ) 23 | return "\n".join(lines) 24 | else: 25 | return s 26 | 27 | 28 | def render_trace(trace_data): 29 | if not trace_data: 30 | return '' 31 | 32 | max_trace_lines = get_settings().code_tester.get("max_trace_lines") 33 | trace_data = clip_string(trace_data, max_trace_lines) 34 | return trace_data 35 | 36 | 37 | def postprocess_response(response): 38 | response = str(response) 39 | if response.endswith("stop"): 40 | response = response[:-4] 41 | pattern = r'```\w*\n(.*?)```' 42 | matches = re.findall(pattern, response, re.DOTALL) 43 | if matches: 44 | response = matches[0] 45 | return response 46 | 47 | 48 | def evaluate_solution_on_subset(evaluation_test_type, problem, solution, silent=False, break_on_timeout=True): 49 | # evaluate solution 50 | test_results = None 51 | if evaluation_test_type: 52 | test_results = eval_solution(evaluation_test_type=evaluation_test_type, example=problem, prediction=solution, 53 | silent=silent, break_on_timeout=break_on_timeout) 54 | 55 | if test_results[1] == []: 56 | if not silent: 57 | logger.info("=====================================") 58 | logger.info("No tests") 59 | logger.info("=====================================") 60 | return test_results, 0, 0, 0 61 | 62 | if (hasattr(test_results[1], 'compilation_result') and 63 | test_results[1].compilation_result.program_status.name == 'kTimeout'): 64 | if not silent: 65 | logger.info("=====================================") 66 | logger.info("Timeout") 67 | logger.info("=====================================") 68 | return test_results, 0, 0, len(test_results[0]) 69 | 70 | test_passed = 0 71 | test_failed = 0 72 | test_timeout = 0 73 | if not problem[evaluation_test_type]['input']: 74 | logger.info(f"No {evaluation_test_type} for this problem") 75 | else: 76 | for test in test_results[1].test_results: 77 | if (hasattr(test, 'program_status') and test.program_status.name == 'kTimeout'): 78 | test_timeout += 1 79 | elif not test.passed: 80 | test_failed += 1 81 | else: 82 | test_passed += 1 83 | if not silent: 84 | logger.info("=====================================") 85 | logger.info(f"test_passed: {test_passed}, test_failed: {test_failed}, test_timeout: {test_timeout}") 86 | logger.info("=====================================") 87 | 88 | return test_results, test_passed, test_failed, test_timeout 89 | 90 | 91 | def evaluate_on_private_tests(evaluation_test_type, problem, solution, silent=True): 92 | # evaluate solution 93 | test_results = None 94 | if evaluation_test_type: 95 | test_results = eval_solution(evaluation_test_type=evaluation_test_type, example=problem, prediction=solution, silent=silent) 96 | 97 | test_passed = 0 98 | test_failed = 0 99 | test_timeout = 0 100 | 101 | if not test_results[1]: 102 | logger.info("No tests were run") 103 | return test_results, 0, 0 104 | 105 | for test in test_results[1].test_results: 106 | if test.program_status.name=='kTimeout': 107 | test_timeout += 1 108 | elif not test.passed: 109 | test_failed += 1 110 | else: 111 | test_passed += 1 112 | 113 | 114 | logger.info("=====================================") 115 | logger.info(f"test_passed: {test_passed}, test_failed: {test_failed}, test_timeout: {test_timeout}") 116 | logger.info("=====================================") 117 | 118 | return test_results, test_passed, test_failed, test_timeout 119 | 120 | 121 | def load_yaml(response_text: str, keys_fix_yaml: List[str] = []) -> dict: 122 | response_text = response_text.rstrip("` \n") 123 | 
response_text = response_text.removeprefix('```yaml').rstrip('`') 124 | try: 125 | data = yaml.safe_load(response_text) 126 | except Exception as e: 127 | data = try_fix_yaml(response_text, keys_fix_yaml=keys_fix_yaml) 128 | if not data: 129 | get_logger().info(f"Failed to parse AI YAML prediction: {e}") 130 | return data 131 | 132 | 133 | def try_fix_yaml(response_text: str, keys_fix_yaml: List[str] = []) -> dict: 134 | response_text_lines = response_text.split('\n') 135 | 136 | keys = keys_fix_yaml 137 | response_text_lines_copy = response_text_lines.copy() 138 | for i in range(0, len(response_text_lines_copy)): 139 | for key in keys: 140 | if response_text_lines_copy[i].strip().startswith(key) and '|' not in response_text_lines_copy[i]: 141 | response_text_lines_copy[i] = response_text_lines_copy[i].replace(f'{key}', 142 | f'{key} |-\n ') 143 | try: 144 | data = yaml.safe_load('\n'.join(response_text_lines_copy)) 145 | get_logger().info(f"Successfully parsed AI prediction after adding |-\n") 146 | return data 147 | except Exception as e: 148 | raise ValueError("yaml parsing error") from e -------------------------------------------------------------------------------- /alpha_codium/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/alpha_codium/llm/__init__.py -------------------------------------------------------------------------------- /alpha_codium/llm/ai_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import litellm 5 | import openai 6 | from aiolimiter import AsyncLimiter 7 | from litellm import acompletion 8 | from litellm import RateLimitError 9 | from litellm.exceptions import APIError 10 | # from openai.error import APIError, RateLimitError, Timeout, TryAgain 11 | from retry import retry 12 | 13 | from alpha_codium.settings.config_loader import get_settings 14 | from alpha_codium.log import get_logger 15 | 16 | logger = get_logger(__name__) 17 | OPENAI_RETRIES = 5 18 | 19 | 20 | class AiHandler: 21 | """ 22 | This class handles interactions with the OpenAI API for chat completions. 23 | It initializes the API key and other settings from a configuration file, 24 | and provides a method for performing chat completions using the OpenAI ChatCompletion API. 25 | """ 26 | 27 | def __init__(self): 28 | """ 29 | Initializes the OpenAI API key and other settings from a configuration file. 30 | Raises a ValueError if the OpenAI key is missing. 31 | """ 32 | self.limiter = AsyncLimiter(get_settings().config.max_requests_per_minute) 33 | try: 34 | if "gpt" in get_settings().get("config.model").lower(): 35 | openai.api_key = get_settings().openai.key 36 | litellm.openai_key = get_settings().openai.key 37 | self.azure = False 38 | if "deepseek" in get_settings().get("config.model"): 39 | litellm.register_prompt_template( 40 | model="huggingface/deepseek-ai/deepseek-coder-33b-instruct", 41 | roles={ 42 | "system": { 43 | "pre_message": "", 44 | "post_message": "\n" 45 | }, 46 | "user": { 47 | "pre_message": "### Instruction:\n", 48 | "post_message": "\n### Response:\n" 49 | }, 50 | }, 51 | 52 | ) 53 | except AttributeError as e: 54 | raise ValueError("OpenAI key is required") from e 55 | 56 | @property 57 | def deployment_id(self): 58 | """ 59 | Returns the deployment ID for the OpenAI API.
60 | """ 61 | return get_settings().get("OPENAI.DEPLOYMENT_ID", None) 62 | 63 | @retry( 64 | exceptions=(AttributeError, RateLimitError), 65 | tries=OPENAI_RETRIES, 66 | delay=2, 67 | backoff=2, 68 | jitter=(1, 3), 69 | ) 70 | async def chat_completion( 71 | self, model: str, 72 | system: str, 73 | user: str, 74 | temperature: float = 0.2, 75 | frequency_penalty: float = 0.0, 76 | ): 77 | try: 78 | deployment_id = self.deployment_id 79 | if get_settings().config.verbosity_level >= 2: 80 | logging.debug( 81 | f"Generating completion with {model}" 82 | f"{(' from deployment ' + deployment_id) if deployment_id else ''}" 83 | ) 84 | 85 | async with self.limiter: 86 | logger.info("-----------------") 87 | logger.info("Running inference ...") 88 | logger.debug(f"system:\n{system}") 89 | logger.debug(f"user:\n{user}") 90 | if "deepseek" in get_settings().get("config.model"): 91 | response = await acompletion( 92 | model="huggingface/deepseek-ai/deepseek-coder-33b-instruct", 93 | messages=[ 94 | {"role": "system", "content": system}, 95 | {"role": "user", "content": user}, 96 | ], 97 | api_base=get_settings().get("config.model"), 98 | temperature=temperature, 99 | repetition_penalty=frequency_penalty+1, # the scale of TGI is different from OpenAI 100 | force_timeout=get_settings().config.ai_timeout, 101 | max_tokens=2000, 102 | stop=['<|EOT|>'], 103 | ) 104 | response["choices"][0]["message"]["content"] = response["choices"][0]["message"]["content"].rstrip() 105 | if response["choices"][0]["message"]["content"].endswith("<|EOT|>"): 106 | response["choices"][0]["message"]["content"] = response["choices"][0]["message"]["content"][:-7] 107 | else: 108 | response = await acompletion( 109 | model=model, 110 | deployment_id=deployment_id, 111 | messages=[ 112 | {"role": "system", "content": system}, 113 | {"role": "user", "content": user}, 114 | ], 115 | temperature=temperature, 116 | frequency_penalty=frequency_penalty, 117 | force_timeout=get_settings().config.ai_timeout, 118 | ) 119 | except (APIError) as e: 120 | logging.error("Error during OpenAI inference") 121 | raise 122 | except RateLimitError as e: 123 | logging.error("Rate limit error during OpenAI inference") 124 | raise 125 | except Exception as e: 126 | logging.error("Unknown error during OpenAI inference: ", e) 127 | raise APIError from e 128 | if response is None or len(response["choices"]) == 0: 129 | raise APIError 130 | resp = response["choices"][0]["message"]["content"] 131 | finish_reason = response["choices"][0]["finish_reason"] 132 | logger.debug(f"response:\n{resp}") 133 | logger.info('done') 134 | logger.info("-----------------") 135 | return resp, finish_reason 136 | -------------------------------------------------------------------------------- /alpha_codium/llm/ai_invoker.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import traceback 3 | from typing import Callable, List 4 | 5 | from alpha_codium.settings.config_loader import get_settings 6 | 7 | 8 | async def send_inference(f: Callable): 9 | all_models = _get_all_models() 10 | all_deployments = _get_all_deployments(all_models) 11 | # try each (model, deployment_id) pair until one is successful, otherwise raise exception 12 | for i, (model, deployment_id) in enumerate(zip(all_models, all_deployments)): 13 | try: 14 | get_settings().set("openai.deployment_id", deployment_id) 15 | return await f(model) 16 | except Exception: 17 | logging.warning( 18 | f"Failed to generate prediction with {model}" 19 | f"{(' from 
deployment ' + deployment_id) if deployment_id else ''}: " 20 | f"{traceback.format_exc()}" 21 | ) 22 | if i == len(all_models) - 1: # If it's the last iteration 23 | raise # Re-raise the last exception 24 | 25 | 26 | def _get_all_models() -> List[str]: 27 | model = get_settings().config.model 28 | fallback_models = get_settings().config.fallback_models 29 | if not isinstance(fallback_models, list): 30 | fallback_models = [m.strip() for m in fallback_models.split(",")] 31 | all_models = [model] + fallback_models 32 | return all_models 33 | 34 | 35 | def _get_all_deployments(all_models: List[str]) -> List[str]: 36 | deployment_id = get_settings().get("openai.deployment_id", None) 37 | fallback_deployments = get_settings().get("openai.fallback_deployments", []) 38 | if not isinstance(fallback_deployments, list) and fallback_deployments: 39 | fallback_deployments = [d.strip() for d in fallback_deployments.split(",")] 40 | if fallback_deployments: 41 | all_deployments = [deployment_id] + fallback_deployments 42 | if len(all_deployments) < len(all_models): 43 | raise ValueError( 44 | f"The number of deployments ({len(all_deployments)}) " 45 | f"is less than the number of models ({len(all_models)})" 46 | ) 47 | else: 48 | all_deployments = [deployment_id] * len(all_models) 49 | return all_deployments 50 | -------------------------------------------------------------------------------- /alpha_codium/llm/token_handler.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, StrictUndefined 2 | from tiktoken import encoding_for_model, get_encoding 3 | 4 | from alpha_codium.settings.config_loader import get_settings 5 | 6 | 7 | def get_token_encoder(): 8 | return ( 9 | encoding_for_model(get_settings().config.model) 10 | if "gpt" in get_settings().config.model 11 | else get_encoding("cl100k_base") 12 | ) 13 | 14 | 15 | class TokenHandler: 16 | """ 17 | A class for handling tokens in the context of a pull request. 18 | 19 | Attributes: 20 | - encoder: An object of the encoding_for_model class from the tiktoken module. Used to encode strings and count the 21 | number of tokens in them. 22 | - limit: The maximum number of tokens allowed for the given model, as defined in the MAX_TOKENS dictionary in the 23 | pr_agent.algo module. 24 | - prompt_tokens: The number of tokens in the system and user strings, as calculated by the _get_system_user_tokens 25 | method. 26 | """ 27 | 28 | def __init__(self, message=None, vars: dict = {}, system="", user=""): # noqa: B006 29 | """ 30 | Initializes the TokenHandler object. 31 | 32 | Args: 33 | - pr: The pull request object. 34 | - vars: A dictionary of variables. 35 | - system: The system string. 36 | - user: The user string. 37 | """ 38 | self.encoder = get_token_encoder() 39 | if message is not None: 40 | self.prompt_tokens = self._get_system_user_tokens( 41 | message, self.encoder, vars, system, user 42 | ) 43 | 44 | def _get_system_user_tokens(self, message, encoder, vars: dict, system, user): 45 | """ 46 | Calculates the number of tokens in the system and user strings. 47 | 48 | Args: 49 | - message: The pull request object. 50 | - encoder: An object of the encoding_for_model class from the tiktoken module. 51 | - vars: A dictionary of variables. 52 | - system: The system string. 53 | - user: The user string. 54 | 55 | Returns: 56 | The sum of the number of tokens in the system and user strings. 
57 | """ 58 | environment = Environment(undefined=StrictUndefined) 59 | system_prompt = environment.from_string(system).render(vars) 60 | user_prompt = environment.from_string(user).render(vars) 61 | system_prompt_tokens = len(encoder.encode(system_prompt)) 62 | user_prompt_tokens = len(encoder.encode(user_prompt)) 63 | return system_prompt_tokens + user_prompt_tokens 64 | 65 | def count_tokens(self, patch: str) -> int: 66 | """ 67 | Counts the number of tokens in a given patch string. 68 | 69 | Args: 70 | - patch: The patch string. 71 | 72 | Returns: 73 | The number of tokens in the patch string. 74 | """ 75 | return len(self.encoder.encode(patch, disallowed_special=())) 76 | -------------------------------------------------------------------------------- /alpha_codium/log/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import sys 4 | from enum import Enum 5 | 6 | from loguru import logger 7 | 8 | 9 | class LoggingFormat(str, Enum): 10 | CONSOLE = "CONSOLE" 11 | JSON = "JSON" 12 | 13 | 14 | def json_format(record: dict) -> str: 15 | return record["message"] 16 | 17 | 18 | def setup_logger(logger_path: str = "./example.log", 19 | level: str = "INFO", 20 | fmt: LoggingFormat = LoggingFormat.CONSOLE): 21 | level: int = logging.getLevelName(level.upper()) 22 | if type(level) is not int: 23 | level = logging.INFO 24 | 25 | fileHandler = logging.FileHandler(logger_path, mode='w') 26 | 27 | if fmt == LoggingFormat.JSON: 28 | logger.remove(None) 29 | logger.add( 30 | sys.stdout, 31 | level=level, 32 | format="{message}", 33 | colorize=False, 34 | serialize=True, 35 | ) 36 | elif fmt == LoggingFormat.CONSOLE: 37 | logger.remove(None) 38 | logger.add(sys.stdout, level=level, colorize=True) 39 | logger.add(fileHandler, level=logging.DEBUG) 40 | 41 | return logger 42 | 43 | 44 | def get_logger(*args, **kwargs): 45 | return logger 46 | -------------------------------------------------------------------------------- /alpha_codium/settings/.secrets_template.toml: -------------------------------------------------------------------------------- 1 | [openai] 2 | #key = "..." 3 | -------------------------------------------------------------------------------- /alpha_codium/settings/choose_best_solution_direct.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_choose_best_solution_direct] 2 | temperature = 0.3 3 | system = """\ 4 | To solve the problem, utilize an exhaustive high-computational approach like simulation, recursion, brute-force or other direct approach. Ignore the problem constraints regarding large input size. 5 | """ 6 | User="""\ 7 | You are given a code contest problem, self-reflection on the problem, and a solution concept. 8 | 9 | problem description: 10 | ========== 11 | {{description|trim}} 12 | ========== 13 | 14 | 15 | self-reflection on the problem: 16 | ============ 17 | {{ self_reflection|trim }} 18 | ============ 19 | 20 | 21 | solution concept: 22 | ========== 23 | 'To solve the problem, utilize an exhaustive high-computational approach like simulation, recursion, brute-force or other direct approach. Ignore the problem constraints regarding large input size.' 24 | ========== 25 | 26 | 27 | Using the inputs above, your goal is to present a full exhaustive solution to the code contest problem. 
28 | The output must be a YAML object equivalent to type $ExhaustiveProblemSolution, according to the following Pydantic definitions: 29 | ===== 30 | class Test(BaseModel): 31 | input: str 32 | output: str 33 | 34 | class ExhaustiveProblemSolution(BaseModel): 35 | name: str = Field(description="The name of the best solution") 36 | content: str = Field(description="Describe in words content of the solution") 37 | problem_rules: str = Field(description="Describe the problem rules, in bullet points") 38 | problem_stopping_criteria: str = Field(description="Describe the stopping criteria problem") 39 | pseudo_code: str = Field(description="Describe a pseudo code of the solution. Be specific and detailed") 40 | ===== 41 | 42 | 43 | Example YAML output: 44 | ```yaml 45 | name: | 46 | ... 47 | content: | 48 | ... 49 | problem_rules: | 50 | ... 51 | problem_stopping_criteria: | 52 | ... 53 | pseudo_code: | 54 | ... 55 | ``` 56 | 57 | Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). 58 | 59 | Answer: 60 | ```yaml\ 61 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompt_analyze_and_fix.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_analyze_and_fix] 2 | temperature = 0.2 3 | system = """\ 4 | - You must divide the fixed code into small sub-functions, with meaningful names and functionality. Each function should be no longer than 10 lines of code. 5 | - The fixed code should be robust and general, and work for other input examples as well. 6 | - The fixed should be different from the original code, and not just a copy-paste of the original code. 7 | """ 8 | user="""\ 9 | {%- if use_self_reflection_public %} 10 | You are given a code contest problem, and a self-reflection on the problem: 11 | 12 | problem description: 13 | ============= 14 | {{ description|trim }} 15 | ============= 16 | 17 | 18 | self-reflection on the problem: 19 | ====== 20 | {{ self_reflection|trim }} 21 | ====== 22 | 23 | {%- else %} 24 | 25 | You are given a code contest problem: 26 | ============= 27 | {{ description_short|trim }} 28 | ============= 29 | 30 | {%- endif %} 31 | 32 | 33 | A Python code solution was generated for the problem: 34 | ============= 35 | {{ code_recent_solution|trim }} 36 | ============= 37 | 38 | 39 | However, when running the input-output example test, the code solution failed to produce the expected output, and gave the following error message: 40 | ============= 41 | {{ error_str|trim }} 42 | ============= 43 | 44 | 45 | Using all the information above, your goal is generate a fixed code solution, that will pass the test. 46 | 47 | The output must be a YAML object equivalent to type $FixedCode, according to the following Pydantic definitions: 48 | ===== 49 | class FixedCode(BaseModel): 50 | failed_tests: str = Field(description="List the input-output tests that failed. use the format [{input: .., expected_output: .., code_output: ..}, ...]") 51 | what_went_wrong: str = Field(description="Describe shortly, in bullet points, what went wrong with the previous solutions.") 52 | fixed_flow: str = Field(description="Describe a fixed flow leading to the correct solution, in pseudo-code. Be specific and detailed.") 53 | fixed_code: str = Field(description="A fixed code solution. Don't explain your answer. 
Just provide the code itself, and nothing else") 54 | ===== 55 | 56 | Example YAML output: 57 | ```yaml 58 | failed_tests: | 59 | ... 60 | what_went_wrong: | 61 | ... 62 | fixed_flow: | 63 | ... 64 | fixed_code: | 65 | ... 66 | ``` 67 | 68 | Additional guidelines for generating the fixed code: 69 | - Divide the generated code into small sub-functions, with meaningful names and functionality. Keep the code simple and short. 70 | - The fixed code should be robust and general, and work for other input examples as well 71 | - Double-check the new solution code, and make sure it is correct and complete. 72 | - The fixed code must have a reasonable runtime - less than three seconds on a modern computer, given the problem constraints for large input. 73 | - The code needs to be self-contained, and executable as-is. 74 | 75 | 76 | Answer: 77 | ```yaml 78 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompt_analyze_and_fix_direct.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_analyze_and_fix_direct] 2 | temperature = 0.2 3 | system = """\ 4 | - You must divide the new code into small sub-functions, with meaningful names and functionality. Variable names should also be meaningful. 5 | - The new code still needs to utilize an exhaustive approach like simulation, recursion, brute-force or direct solution. Ignore the problem constraints regarding large input size. 6 | - The new code should be different from the original code, and not just a copy-paste of the original code. 7 | """ 8 | user="""\ 9 | You are given a code contest problem. 10 | 11 | 12 | problem description: 13 | ================ 14 | {{ description_short|trim }} 15 | ================ 16 | 17 | 18 | A code solution was generated for the problem: 19 | ============= 20 | {{ code_recent_solution|trim }} 21 | ============= 22 | 23 | 24 | However, when running the input-output example test, the code solution failed to produce the expected output, and gave the following error message: 25 | ============= 26 | {{ error_str|trim }} 27 | ============= 28 | 29 | 30 | Using the information above, your goal is to generate a fixed Python code, that will correctly solve the problem. 31 | - The fixed code still needs to utilize an exhaustive approach like simulation, recursion, brute-force or direct solution. Ignore the problem constraints regarding large input size. 32 | - If possible, provide minor optimizations to the code, but this is not required. 33 | - Make sure the fixed code covers relevant edge cases of the problem. 34 | 35 | The output must be a YAML object equivalent to type $FixedCode, according to the following Pydantic definitions: 36 | ===== 37 | class FixedCode(BaseModel): 38 | failed_test: str = Field(description="list the input-output test that failed. use the format {input: .., expected_output: .., code_output: ..}") 39 | what_went_wrong: str = Field(description="explain what went wrong with the code solution") 40 | fixed_code: str = Field(description="A fixed code solution. Don't explain your answer. Just provide a fixed code, and nothing else") 41 | ===== 42 | 43 | Example YAML output: 44 | ```yaml 45 | failed_test: |- 46 | ... 47 | what_went_wrong: |- 48 | ... 49 | fixed_code: |- 50 | ... 51 | ``` 52 | 53 | Each YAML output MUST be after a newline, indented, with block scalar indicator ('|-'). 
54 | 55 | 56 | Answer: 57 | ```yaml 58 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompt_analyze_failure.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_analyze_failure] 2 | temperature = 0.3 3 | system = """\ 4 | """ 5 | user="""\ 6 | You are given a code contest problem, and a self-reflection on the problem: 7 | 8 | 9 | problem description: 10 | ====== 11 | {{ description_short|trim }} 12 | ====== 13 | 14 | 15 | self-reflection on the problem: 16 | ====== 17 | {{ self_reflection|trim }} 18 | ====== 19 | 20 | 21 | A Python code solution was generated for the problem: 22 | ====== 23 | {{ code_recent_solution|trim }} 24 | ====== 25 | 26 | 27 | However, when running the following input example, the code solution above failed to produce the expected output: 28 | ====== 29 | {{ error_str|trim }} 30 | ====== 31 | 32 | {%- if use_test_explanations_public %} 33 | 34 | Here is an explanation of how the input should have led to the expected output: 35 | ====== 36 | {{ test_explanation_current|trim }} 37 | ====== 38 | {%- endif %} 39 | 40 | 41 | Your goal is to analyze the code solution and the error, and propose a fix so the code will produce the expected output for the provided test input. 42 | The fix should keep the solution robust, and work for all other input examples as well. 43 | Make sure the fix has a reasonable runtime - less than three seconds on a modern computer, given the problem constraints for large input. 44 | 45 | 46 | The output must be a YAML object equivalent to type $FixedSolution, according to the following Pydantic definitions: 47 | ====== 48 | class FixedSolution(BaseModel): 49 | failed_tests: str = Field(description="List the input-output tests that failed. use the format [{input: .., expected_output: .., code_output: ..}, ...]") 50 | what_went_wrong: str = Field(description="Explanation shortly, in words, what was the problem with the code solution, and how should it be fix. Be as specific as possible. Don't generate actuall code.") 51 | fixed_flow: str = Field(description="Describe, in bullet points, a fixed flow that will calculate the correct output. be specific and elaborate. Emphasize the fixed parts, and how they apply to getting the correct output") 52 | ====== 53 | 54 | 55 | Example YAML output: 56 | ```yaml 57 | failed_tests: | 58 | ... 59 | what_went_wrong: | 60 | ... 61 | fixed_flow: | 62 | ... 63 | ``` 64 | 65 | 66 | Answer: 67 | ```yaml 68 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_baseline.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_baseline] 2 | temperature = 0.3 3 | system= """\ 4 | """ 5 | user=""" 6 | You are given a code contest problem: 7 | 8 | problem description: 9 | ============= 10 | {{description}} 11 | ============= 12 | 13 | 14 | Your goal is to generate a valid Python code that correctly solves the problem. 15 | Make sure to fully address the problem goals, rules and constraints. 16 | The code should be robust and general, and work for other input examples as well, not just the one given in the problem description. 17 | 18 | guidelines: 19 | - Generate only code, without any additional explanations or comments. 20 | - Make sure to include all the necessary module imports, properly initialize the variables, and address the problem constraints. 
21 | - The code needs to be self-contained, and executable as-is. 22 | 23 | The code output must follow this structure: 24 | ``` 25 | def f1(...): 26 | ... 27 | return ... 28 | 29 | def f2(...): 30 | ... 31 | return ... 32 | ... 33 | 34 | if __name__ == "__main__": 35 | ... 36 | ``` 37 | The code should read the input using the 'input()' method. Make sure to properly parse the input, according to the problem description. 38 | The output should be printed without additional words using the 'print()' method. 39 | 40 | answer: 41 | ```python 42 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_choose_best_solution.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_choose_best_solution] 2 | temperature = 0.2 3 | system = """\ 4 | """ 5 | 6 | User="""\ 7 | You are given a code contest problem, and a self-reflection on the problem: 8 | 9 | 10 | problem description: 11 | ======= 12 | {{description|trim}} 13 | ======= 14 | 15 | 16 | self-reflection on the problem: 17 | ======= 18 | {{ self_reflection|trim }} 19 | ======= 20 | 21 | 22 | Here is a list of {{ s_possible_solutions|length }} possible solutions to the problem: 23 | ======= 24 | {{ s_possible_solutions_str|trim }} 25 | ======= 26 | 27 | 28 | Using the inputs above, your goal is to choose the best solution to the code contest problem. 29 | Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a simple and robust manner. 30 | Make sure the chosen solution has a reasonable runtime - less than three seconds on a modern computer, given the problem constraints regarding large inputs. 31 | 32 | The output must be a YAML object equivalent to type $ProblemSolution, according to the following Pydantic definitions: 33 | ======= 34 | class Test(BaseModel): 35 | input: str 36 | output: str 37 | 38 | class ProblemSolution(BaseModel): 39 | name: str = Field(description="The name of the best solution") 40 | content: str = Field(description="The content of the best solution") 41 | why: str = Field(description="Shortly explain why is this the best solution") 42 | flow: List[str] = Field(description="Describe of the flow of the solution, in bullet points") 43 | problem_tests: List[Test] = Field("List the input-output examples that are provided in the problem description.") 44 | input_output_examples_flow: List[str] = Field(description="Describe, in bullet points, how the proposed flow will lead to getting the expected output for the provided input examples") 45 | ======= 46 | 47 | 48 | Example YAML output: 49 | ```yaml 50 | name: | 51 | ... 52 | content: | 53 | ... 54 | why: | 55 | ... 56 | flow: 57 | - | 58 | ... 59 | - | 60 | ... 61 | ... 62 | problem_tests: 63 | - input: | 64 | ... 65 | output: | 66 | ... 67 | input_output_examples_flow: 68 | - | 69 | ... 70 | - | 71 | ... 72 | ``` 73 | 74 | Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). 75 | 76 | Answer: 77 | ```yaml\ 78 | """ 79 | -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_fix_solution.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_fix_solution] 2 | temperature = 0.3 3 | system="""\ 4 | - You must divide the fixed code into small sub-functions, with meaningful names and functionality. 
Each function should be no longer than 10 lines of code. 5 | - The fixed code should be robust and general, and work for other input examples as well. 6 | - The fixed should be different from the original code, and not just a copy-paste of the original code. 7 | """ 8 | user="""\ 9 | You are given a code contest problem: 10 | ============= 11 | {{ description_short|trim }} 12 | ============= 13 | 14 | 15 | A previous Python solution code was generated for the problem: 16 | ============= 17 | {{ code_recent_solution|trim }} 18 | ============= 19 | 20 | 21 | However, when running the input-output example test, the code failed to produce the expected output: 22 | ===================================== 23 | Error message when running the 'solution code': 24 | ' 25 | {{ error_str|trim }} 26 | ' 27 | ===================================== 28 | 29 | 30 | We analyzed the error message, and concluded the following about the problem: 31 | ============= 32 | {{ what_went_wrong|trim }} 33 | ============= 34 | 35 | 36 | Here is a fixed flow, that a correct solution code should follow: 37 | ============= 38 | {{ fixed_flow|trim }} 39 | ============= 40 | 41 | 42 | Using the analysis above, you need to generate a fixed solution code, that will pass all the tests. 43 | Additional guidelines for generating the fixed code: 44 | - The fixed solution code must pass all the tests, and have a reasonable runtime - less than three seconds on a modern computer, under the problem constraints. 45 | - Make sure the new solution code generalizes to all possible input-output examples, not just the provided input-output examples. 46 | - You must divide the new solution code into small sub-functions, with meaningful names and functionality 47 | 48 | 49 | The code output must follow this structure: 50 | ```` 51 | def f1(...): 52 | ... 53 | return ... 54 | 55 | def f2(...): 56 | ... 57 | return ... 58 | ... 59 | 60 | if __name__ == "__main__": 61 | ... 62 | ``` 63 | The code should read the input using the 'input()' method. Make sure to properly parse the input, according to the problem description. 64 | The output should be printed without additional words using the 'print()' method. 65 | 66 | 67 | Answer: 68 | ```python 69 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_generate_ai_tests.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_generate_ai_tests] 2 | temperature = 0.2 3 | system = """\ 4 | """ 5 | 6 | User="""\ 7 | You are given a code contest problem and a self-reflection on the problem: 8 | 9 | 10 | problem description: 11 | ====== 12 | {{ description|trim }} 13 | ====== 14 | 15 | 16 | self-reflection on the problem: 17 | ====== 18 | {{ self_reflection|trim }} 19 | ====== 20 | 21 | {%- if use_test_explanations_possible_solutions %} 22 | 23 | 24 | Here are also explanations for the problem test cases: 25 | ============ 26 | {{ tests_explanations_str|trim }} 27 | ============ 28 | {%- endif %} 29 | 30 | 31 | Your task is to generate additional {{ number_of_ai_tests }} diverse input-output examples for the code contest problem. 32 | Try to cover cases that are not covered by the original tests. Also include a test for large inputs. 33 | The generated tests should be sorted by difficulty, from easiest to hardest. 34 | All the inputs should be valid, and the outputs are correct. Double check them, and validate they match the problem description and rules. 
35 | 36 | The output must be a valid YAML object equivalent to type $ProblemTests, according to the following Pydantic definitions: 37 | ====== 38 | class Test(BaseModel): 39 | input: str 40 | output: str 41 | explanation: str = Field(description='Short explanation how we got the output from the input. Be specific') 42 | 43 | class ProblemTests(BaseModel): 44 | tests: List[Test] = Field(min_items={{number_of_ai_tests}}, max_items={{number_of_ai_tests}}) 45 | ====== 46 | 47 | 48 | Example YAML output: 49 | ```yaml 50 | tests: 51 | - input: | 52 | ... 53 | output: | 54 | ... 55 | explanation: | 56 | ... 57 | ... 58 | ``` 59 | 60 | Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). 61 | 62 | Answer: 63 | ```yaml\ 64 | """ 65 | -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_generate_possible_solutions.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_generate_possible_solutions] 2 | temperature = 0.3 3 | system= """\ 4 | Pay attention to small details and nuances in the problem description. 5 | """ 6 | user="""You are given a code contest problem, and a self-reflection on the problem: 7 | 8 | problem description: 9 | ===== 10 | {{description}} 11 | ===== 12 | 13 | 14 | self-reflection on the problem: 15 | ============ 16 | {{ self_reflection|trim }} 17 | ============ 18 | 19 | {%- if use_test_explanations_possible_solutions %} 20 | 21 | 22 | Here are also explanations for the problem test cases: 23 | ============ 24 | {{ tests_explanations_str|trim }} 25 | ============ 26 | {%- endif %} 27 | 28 | 29 | Your goal is to come up with possible solutions to the code contest problem. 30 | 31 | Guidelines: 32 | - Make sure each solution fully addresses the problem goals, constraints, examples, and notes. 33 | - Each solution must have reasonable runtime and memory complexity - less than three seconds on a modern computer, given the problem constraints for large inputs. 34 | - Double-check the solutions. Each possible solution must be able to generalize to additional test cases, not just the ones provided in the problem description. 35 | 36 | The output must be a YAML object equivalent to type $ProblemSolutions, according to the following Pydantic definitions: 37 | ====== 38 | class Solution(BaseModel): 39 | name: str = Field(description="The name of the solution") 40 | content: str = Field(description="A description of the solution") 41 | why_it_works: str = Field(description="Shortly explain why this solution correctly solves the problem. Be specific and detailed regarding the problem rules and goals.") 42 | labels: List[str] = Field(description="A list of labels for the solution. For example (partial list): binary search, dynamic programming, trees, combinatorics, dfs, bfs, graphs, greedy, math, data structures, geometry, number theory, two pointers, simulation, direct approach, probabilities, ...") 43 | complexity: str = Field(description="The complexity of the solution") 44 | 45 | 46 | class $ProblemSolutions(BaseModel): 47 | possible_solutions: List[Solution] = Field(max_items={{max_num_of_possible_solutions}}, description="A list of possible solutions to the problem. Make sure each solution fully addresses the problem rules and goals.") 48 | ====== 49 | 50 | 51 | Example YAML output: 52 | ```yaml 53 | possible_solutions: 54 | - name: | 55 | ... 56 | content: | 57 | ... 58 | why_it_works: | 59 | ... 
60 | labels: 61 | - ... 62 | - ... 63 | complexity: | 64 | ... 65 | ``` 66 | 67 | Answer: 68 | ```yaml\ 69 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_reflect.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompt_reflect] 2 | temperature = 0.2 3 | system= """\ 4 | The self-reflection must cover every aspect of the problem. Pay attention to small details and nuances in the problem description. 5 | """ 6 | user="""You are given a code contest problem: 7 | 8 | problem name: '{{name}}' 9 | 10 | 11 | problem description: 12 | ===== 13 | {{description|trim}} 14 | ===== 15 | 16 | 17 | Given the code contest problem, you have two tasks: 18 | 1) Reflect on the problem, and describe it in your own words, in bullet points. Pay attention to small details, nuances, notes and examples in the problem description. 19 | 2) Explain how each provided example input leads to the corresponding output (in total {{ actual_number_of_tests }} examples are provided). 20 | Read carefully the problem description. Make sure the test explanations are consistent with them, and between themselves. 21 | The explanation must coherently and logically lead from the input to the output. Be as specific as possible. 22 | 23 | The output must be a YAML object equivalent to type $ProblemReflection, according to the following Pydantic definitions: 24 | ===== 25 | Class InputOutput(BaseModel): 26 | input: str 27 | output: str 28 | explanation: str = Field(description="Short explanation how the test input leads to the test output.") 29 | 30 | 31 | class ProblemReflection(BaseModel): 32 | self_reflection: str = Field(description="Describe the problem in your own words, in bullet points. Address the problem goals, inputs, outputs, rules, constraints, and other relevant details.") 33 | tests_explanations: list[InputOutput] = Field(max_items={{ actual_number_of_tests }}, description="List of explanations for each test case") 34 | ===== 35 | 36 | Example YAML output: 37 | ```yaml 38 | self_reflection: 39 | - | 40 | ... 41 | - | 42 | ... 43 | tests_explanations: 44 | - input: | 45 | ... 46 | output: | 47 | .. 48 | explanation: | 49 | ... 50 | ... 51 | ``` 52 | 53 | 54 | Answer: 55 | ```yaml 56 | """ 57 | -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_solve.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_solve] 2 | temperature = 0.3 3 | system= """\ 4 | - You must divide the generated code into small sub-functions, with meaningful names and functionality. Each function should be no longer than 10 lines of code. 5 | - Double-check the solution code. The generated solution must generalize to any valid input, and not just the provided examples. 6 | """ 7 | user="""\ 8 | You are given a code contest problem, and a self-reflection on the problem. 
9 | 10 | 11 | problem description: 12 | ============= 13 | {{ description|trim }} 14 | ============= 15 | 16 | 17 | self-reflection on the problem: 18 | ====== 19 | {{ self_reflection|trim }} 20 | ====== 21 | 22 | 23 | Your goal is to generate a valid Python code that correctly solves the code contest problem, using the following algorithm: 24 | ============= 25 | {{ s_best_solution|trim }} 26 | ============= 27 | 28 | 29 | 30 | Guidelines: 31 | - You must divide the generated code into small sub-functions, with meaningful names and functionality. Variables names should also be meaningful. 32 | - Double-check the generated code. It should generalize to any valid input, and not just the provided examples. 33 | - Make sure to include all the necessary module imports, properly initialize the variables, and address the problem constraints. 34 | - The code needs to be self-contained, and executable as-is. 35 | 36 | 37 | 38 | The generated code must follow this structure: 39 | ``` 40 | def f1(...): 41 | ... 42 | return ... 43 | 44 | def f2(...): 45 | ... 46 | return ... 47 | ... 48 | 49 | if __name__ == "__main__": 50 | ... 51 | ``` 52 | The code should read the input using the 'input()' method. Make sure to properly parse the input, according to the problem description. 53 | The output should be printed without additional words using the 'print()' method. 54 | 55 | 56 | Answer: 57 | ```python 58 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_solve_direct.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_solve_direct] 2 | temperature = 0.3 3 | system= """\ 4 | - You must divide the generated code into small sub-functions, with meaningful names and functionality. Each function should be no longer than 10 lines of code. 5 | - The code should ignore the problem constraints for large inputs. 6 | """ 7 | user="""\ 8 | You are given a code contest problem, and a self-reflection on the problem. 9 | 10 | 11 | problem description: 12 | ============= 13 | {{ description|trim }} 14 | ============= 15 | 16 | 17 | self-reflection on the problem: 18 | ====== 19 | {{ self_reflection|trim }} 20 | ====== 21 | 22 | 23 | Your goal is to generate a valid Python code that correctly solves the code contest problem, using the following algorithm: 24 | ============= 25 | {{ s_best_solution|trim }} 26 | ============= 27 | 28 | 29 | 30 | Guidelines: 31 | - You must divide the generated code into small sub-functions, with meaningful names and functionality. Variables names should also be meaningful. 32 | - Double-check the solution code. Make sure to include all the necessary module imports, properly initialize the variables, and address the problem constraints. 33 | - The code needs to be self-contained, and executable as-is. Output only code, without any explanations or comments. 34 | 35 | 36 | 37 | The code output must follow this structure: 38 | ``` 39 | def f1(...): 40 | ... 41 | return ... 42 | 43 | def f2(...): 44 | ... 45 | return ... 46 | ... 47 | 48 | if __name__ == "__main__": 49 | ... 50 | ``` 51 | The code should read the input using the 'input()' method. Make sure to properly parse the input, according to the problem description. 52 | The output should be printed without additional words using the 'print()' method. 
53 | 54 | 55 | Answer: 56 | ```python 57 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_validate_ai_tests.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_validate_ai_tests] 2 | temperature = 0.2 3 | system = """\ 4 | Your goal is to consider each AI-generated test, and make sure its output and its explanation are correct. Be critical - they could be wrong. 5 | guidelines: 6 | - Read carefully the problem description. Make sure the output and the explanations are consistent with them, and between themselves. 7 | - Make sure you understand problem constraints, rules, and examples. 8 | - The tests explanations must coherently and logically lead from the input to the output. 9 | """ 10 | 11 | User="""\ 12 | You are given a code contest problem and a self-reflection on the problem: 13 | 14 | 15 | problem description: 16 | ====== 17 | {{ description|trim }} 18 | ====== 19 | 20 | 21 | self-reflection on the problem: 22 | ====== 23 | {{ self_reflection|trim }} 24 | ====== 25 | 26 | 27 | Here are additional tests for the problem, generated by an AI: 28 | 29 | AI-generated tests: 30 | ============ 31 | {{ problem_ai_tests|trim }} 32 | ============ 33 | 34 | 35 | Your goal is to consider each AI-generated test, and make sure the output and the explanation are correct. Be critical - they could be wrong. 36 | 37 | Guidelines: 38 | - Read the problem description carefully. Make sure the output and the explanations are consistent with them, and between themselves. 39 | - The test explanations must coherently and logically lead from the input to the output. 40 | 41 | The output must be a YAML object equivalent to type $ProblemTests, according to the following Pydantic definitions: 42 | ===== 43 | Class Test(BaseModel): 44 | input: str 45 | output: str 46 | explanation: str = Field(description="Short explanation of how the input leads to the output.") 47 | 48 | class ProblemTests(BaseModel): 49 | tests: List[Test] = Field(min_items={{number_of_ai_tests}}, max_items={{number_of_ai_tests}}) 50 | ===== 51 | 52 | 53 | Example YAML output: 54 | ```yaml 55 | tests: 56 | - input: | 57 | ... 58 | output: | 59 | ... 60 | explanation: | 61 | ... 62 | ... 63 | ``` 64 | 65 | Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). 66 | 67 | Answer: 68 | ```yaml\ 69 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/code_contests_prompts_validate_reflection.toml: -------------------------------------------------------------------------------- 1 | [code_contests_prompts_validate_reflection] 2 | temperature = 0.2 3 | system = """\ 4 | """ 5 | 6 | User="""\ 7 | You are given a code contest problem, and ai-generated explanations of how each input example leads to the corresponding output: 8 | 9 | 10 | problem description: 11 | ============ 12 | {{description|trim}} 13 | ============ 14 | 15 | 16 | tests explanations: 17 | ============ 18 | {{ tests_explanations_str|trim }} 19 | ============ 20 | 21 | 22 | Your goal is to consider each test explanation, and make sure it is correct and complete. Be critical - the provided explanations may be wrong or incomplete. 23 | Read carefully the problem description. Make sure the test explanations are consistent with them, and between themselves. 
24 | The explanations must coherently and logically lead from the input to the output, with the actual flow. Be specific as possible, and describe in detail how the input leads to the output. 25 | Pay attention to the problem constraints, and small details. 26 | 27 | 28 | The output must be a YAML object equivalent to type $InputOutputExplanation, according to the following Pydantic definitions: 29 | ===== 30 | Class InputOutput(BaseModel): 31 | input: str 32 | output: str 33 | explanation: str = Field(description="Short explanation of how the input leads to the output. Be specific as possible.") 34 | 35 | 36 | class $InputOutputExplanation(BaseModel): 37 | fixed_tests_explanations: list[InputOutput] = Field(max_items = {{ actual_number_of_tests }}) 38 | ===== 39 | 40 | 41 | Example YAML output: 42 | ```yaml 43 | fixed_tests_explanations: 44 | - input: | 45 | ... 46 | output: | 47 | .. 48 | explanation: | 49 | ... 50 | ... 51 | ``` 52 | 53 | Answer: 54 | ```yaml\ 55 | """ -------------------------------------------------------------------------------- /alpha_codium/settings/config_loader.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from os import listdir 3 | from os.path import abspath, dirname, join, isfile 4 | import glob 5 | 6 | from dynaconf import Dynaconf 7 | 8 | PR_AGENT_TOML_KEY = "pr-agent" 9 | 10 | current_dir = dirname(abspath(__file__)) 11 | # setting_dir = join(current_dir, "settings") 12 | setting_dir = current_dir 13 | 14 | 15 | 16 | toml_files = list(pathlib.Path(join(setting_dir)).glob('*.toml')) # includes hidden files 17 | global_settings = Dynaconf( 18 | envvar_prefix=False, 19 | merge_enabled=True, 20 | settings_files=toml_files, 21 | ) 22 | 23 | 24 | def get_settings(): 25 | return global_settings 26 | -------------------------------------------------------------------------------- /alpha_codium/settings/configuration.toml: -------------------------------------------------------------------------------- 1 | [config] 2 | model="gpt-4-0125-preview" 3 | # model="gpt-4o-2024-05-13" 4 | # model="gpt-4-0613" 5 | # model="gpt-3.5-turbo-16k" 6 | frequency_penalty=0.1 7 | ai_timeout=90 # seconds 8 | fallback_models =[] 9 | verbosity_level=0 # 0,1,2 10 | private_dataset_cache_dir="~/.cache/huggingface/datasets/alpha_codium" 11 | max_requests_per_minute=60 12 | 13 | [dataset] 14 | evaluate_prev_solutions=false 15 | num_iterations=1 # X iterations to try to solve the problem 16 | use_iteration_scheme=true 17 | 18 | [solve] 19 | reduce_verbose = false 20 | use_baseline = false 21 | use_direct_solutions=false 22 | 23 | [self_reflection] 24 | validate_self_reflection=false 25 | 26 | [possible_solutions] 27 | max_num_of_possible_solutions=3 28 | use_test_explanations=true 29 | remove_bruce_force_solutions=true 30 | 31 | [generate_ai_tests] 32 | validate_ai_tests=false 33 | number_of_ai_tests=6 34 | use_test_explanations=true 35 | add_public_tests_to_ai_tests=true 36 | 37 | [initial_code_generation] 38 | max_attempts=8 39 | 40 | [public_tests] 41 | max_allowed_calls=4 42 | max_fixes_per_test=3 43 | use_test_explanations=false 44 | single_stage_fix=true 45 | use_self_reflection=false 46 | 47 | [ai_tests] 48 | max_allowed_calls=4 49 | 50 | [code_tester] 51 | tester_type="local" # local, code_contests 52 | order_matters=true 53 | sandbox=true 54 | delta=0.0001 55 | # trace 56 | calc_trace=false 57 | use_trace=false 58 | max_trace_lines=50 59 | trace_depth=4 60 | 61 | [code_contests_tester] 62 | stop_on_first_failure = false 
63 | timeout = 3 64 | path_to_python_bin = "./venv/bin/python3.9" 65 | path_to_python_lib = ["./venv/lib", "./venv/lib/python3.9"] 66 | -------------------------------------------------------------------------------- /alpha_codium/solve_dataset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from alpha_codium.gen.dataset_solver import solve_dataset 4 | from alpha_codium.log import get_logger, setup_logger 5 | 6 | logger = get_logger(__name__) 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--dataset_name", type=str, default="valid_and_test_processed") 10 | parser.add_argument("--split_name", type=str, default="valid") 11 | parser.add_argument("--database_solution_path", type=str, default="") 12 | if __name__ == "__main__": 13 | args = parser.parse_args() 14 | setup_logger() 15 | 16 | # set default database_solution_path 17 | args.database_solution_path = args.database_solution_path 18 | if not args.database_solution_path: 19 | args.database_solution_path = f"./{args.dataset_name}_{args.split_name}_solution_database.json" 20 | logger.info(f"args.database_solution_path: {args.database_solution_path}") 21 | 22 | solve_dataset(dataset_name=args.dataset_name, 23 | split_name=args.split_name, 24 | database_solution_path=args.database_solution_path) 25 | -------------------------------------------------------------------------------- /alpha_codium/solve_my_problem.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from alpha_codium.gen.coding_competitor import solve_problem, solve_my_problem 5 | from alpha_codium.log import setup_logger 6 | from alpha_codium.settings.config_loader import get_settings 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--my_problem_json_file", type=str, default="my_problem_example.json") 10 | 11 | if __name__ == "__main__": 12 | args = parser.parse_args() 13 | setup_logger() 14 | 15 | with open(args.my_problem_json_file, "r") as my_problem: 16 | solve_my_problem(json.load(my_problem)) 17 | -------------------------------------------------------------------------------- /alpha_codium/solve_problem.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from alpha_codium.gen.coding_competitor import solve_problem 4 | from alpha_codium.log import setup_logger 5 | from alpha_codium.settings.config_loader import get_settings 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--dataset_name", type=str, default="valid_and_test_processed") 9 | parser.add_argument("--split_name", type=str, default="valid") 10 | parser.add_argument("--problem_number", type=int, default=0) 11 | parser.add_argument("--problem_name", type=str, default="") 12 | 13 | if __name__ == "__main__": 14 | args = parser.parse_args() 15 | setup_logger() 16 | solve_problem(dataset_name=args.dataset_name, 17 | split_name=args.split_name, 18 | problem_number=args.problem_number, 19 | problem_name=args.problem_name) 20 | -------------------------------------------------------------------------------- /docs/docs/CNAME: -------------------------------------------------------------------------------- 1 | qodo-flow-docs.qodo.ai -------------------------------------------------------------------------------- /docs/docs/assets/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/docs/docs/assets/favicon.ico -------------------------------------------------------------------------------- /docs/docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/docs/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/docs/assets/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 78 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 122 | 124 | 125 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 135 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /docs/docs/css/custom.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #765bfa; 3 | --md-accent-fg-color: #AEA1F1; 4 | } 5 | .md-nav__title, .md-nav__link { 6 | font-size: 16px; 7 | } 8 | 9 | .md-tabs__link { 10 | font-size: 16px; 11 | } 12 | 13 | .md-header__title { 14 | font-size: 20px; 15 | margin-left: 0px !important; 16 | } -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # AlphaCodium 2 | 3 | Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering. 4 | 5 | [Paper](https://arxiv.org/abs/2401.08500) | 6 | [Dataset](https://huggingface.co/datasets/talrid/CodeContests_valid_and_test_AlphaCodium/blob/main/codecontests_valid_and_test_processed_alpha_codium.zip) 7 | 8 | **Official Implementation:** 9 | 10 | Tal Ridnik, Dedy Kredo, Itamar Friedman 11 | 12 | ## Abstract 13 | 14 | Code generation problems differ from common natural language problems - they require matching the exact syntax of the target language, identifying happy paths and edge cases, paying attention to numerous small details in the problem spec, and addressing other code-specific issues and requirements. Hence, many of the optimizations and tricks that have been successful in natural language generation may not be effective for code tasks. 15 | 16 | In this work, we propose a new approach to code generation by LLMs, which we call AlphaCodium - a test-based, multi-stage, code-oriented iterative flow, that improves the performances of LLMs on code problems. 17 | 18 | We tested AlphaCodium on a challenging code generation dataset called CodeContests, which includes competitive programming problems from platforms such as Codeforces. The proposed flow consistently and significantly improves results. 19 | On the validation set, for example, GPT-4 accuracy (pass@5) increased from 19% with a single well-designed direct prompt to 44% with the AlphaCodium flow. 
20 | 21 | Many of the principles and best practices we acquired in this work, we believe, are broadly applicable to general code generation tasks. 22 | 23 | ![Pre-processedf flow](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/proposed_flow.png?raw=true) 24 | ![Iterations](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/iterations.png?raw=true) 25 | 26 | ## Installation 27 | 28 | 1. setup a virtual environment and run: `pip install -r requirements.txt` 29 | 30 | 2. Duplicate the file `alpha_codium/settings/.secrets_template.toml`, rename it as `.secrets.toml`, and fill in your OpenAI API key: 31 | ``` 32 | [openai] 33 | key = "..." 34 | ``` 35 | 36 | 3. Download the processed CodeContest validation and test dataset from [hugging face](https://huggingface.co/datasets/talrid/CodeContests_valid_and_test_AlphaCodium/blob/main/codecontests_valid_and_test_processed_alpha_codium.zip), extract the zip file, and placed the extracted folder in the root of the project. 37 | 38 | ## How to run 39 | 40 | ### Configuration 41 | The file: `alpha_codium/settings/configuration.toml` contains the configuration for the project. 42 | In the `config` section you can choose the model you want to use ("gpt-4", "gpt-3.5-turbo-16k", or others). 43 | 44 | ### Solving a specific problem 45 | To solve a specific problem with AlphaCodium, from the root folder run: 46 | ``` 47 | python -m alpha_codium.solve_problem \ 48 | --dataset_name /path/to/dataset \ 49 | --split_name test \ 50 | --problem_number 0 51 | ``` 52 | - The `dataset_name` is the path to the dataset folder you downloaded in the installation step. 53 | 54 | - Note that the validation set contains 117 problems, and the test set contains 165 problems, so the `problem_number` parameter should be accordingly (zero-based) 55 | 56 | - The `split_name` can be either `valid` or `test`. 57 | 58 | - The following sections in the configuration file: 59 | `solve`, `self_reflection`,`possible_solutions`,`generate_ai_tests`,`initial_code_generation`,`public_tests`, `ai_tests` 60 | enable to adjust possible configurations for the different stages of the flow. 61 | 62 | - Each run logs the results to a file named `alpha_codium/example.log`. Reviewing the log file is a good way to understand what is going on in each stage of the flow. 63 | 64 | ![Example problem (test set, problem number 12)](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/example_problem.png?raw=true) 65 | 66 | ### Solving the entire dataset 67 | to solve the entire dataset with AlphaCodium, from the root folder run: 68 | ``` 69 | python -m alpha_codium.solve_dataset \ 70 | --dataset_name /path/to/dataset \ 71 | --split_name test 72 | --database_solution_path /path/to/output/dir/dataset_output.json 73 | ``` 74 | 75 | - The `split_name` can be either `valid` or `test`. 76 | - `database_solution_path` is the path to the directory where the solutions will be saved. 77 | - The `dataset` section in the configuration file contains the configuration for the running and evaluation of a dataset. 78 | - Note that this is a long process, and it may take a few days to complete with large models (e.g. GPT-4) and several iterations per problem. 79 | - `dataset.num_iterations` defines the number of iterations for each problem (pass@K). For a large number of iterations, it is recommended to introduce some randomness and different options for each iteration to achieve top results. 
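As an alternative to the CLI shown above, the dataset solver can also be driven from a short Python script. The sketch below is based on the entry point in `alpha_codium/solve_dataset.py`; the paths are placeholders that you should replace with your own dataset folder and output file.

```python
from alpha_codium.gen.dataset_solver import solve_dataset
from alpha_codium.log import setup_logger

# Placeholder paths -- point these at your extracted dataset and desired output file.
DATASET_DIR = "./valid_and_test_processed"
OUTPUT_JSON = "./dataset_output.json"

if __name__ == "__main__":
    setup_logger()  # writes stage-by-stage logs to ./example.log by default
    solve_dataset(
        dataset_name=DATASET_DIR,
        split_name="valid",  # or "test"
        database_solution_path=OUTPUT_JSON,
    )
```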
80 | 81 | ### Running the evaluation 82 | 83 | Once you generate a solution for the entire dataset (valid or test), you can evaluate it by running: 84 | ``` 85 | python -m alpha_codium.evaluate_dataset\ 86 | --dataset_name /path/to/dataset\ 87 | --split_name test\ 88 | --database_solution_path /path/to/output/dir/dataset_output.json 89 | ``` 90 | 91 | ## Technical Q&A 92 | Aggregating some technical questions we received about this project: 93 | ___ 94 | **Q: How much time did you spend on "prompt engineering" compared to "flow engineering"?** 95 | 96 | **A:** Structured output almost completely eliminates the need for simple prompt engineering. 97 | We estimate that ~95% of the time we did more high-level design, reasoning, and injecting data at the correct places, ..., a.k.a. "flow engineering". 98 | ___ 99 | 100 | **Q: How do you know that there wasn't a data leakage?** 101 | 102 | **A:** The test set of CodeContests dataset comprises problems published after September 2021, while the GPT-4 model variant we used (gpt-4-0613) has a data cutoff of September 2021. Hence, there is no data leakage for GPT4, on the test set. 103 | For other models like DeepSeek, we cannot be sure. However, note that our [main result](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/comparison.png?raw=true) is a comparison of "direct prompt" vs. "AlphaCodium flow". Data leakage would help both approaches, so the relative improvement of AlphaCodium flow is still valid. 104 | ___ 105 | 106 | **Q: Is this project relevant only to specific programming languages?** 107 | 108 | **A:** No. The proposed flow is language agnostic. We generated solutions in Python, but the flow can be applied to any language. 109 | ___ 110 | 111 | **Q: How did you manage the context window?** 112 | 113 | **A:** We used models with a context window of 8192 tokens, and we did not encounter cases where it did not suffice. 114 | However, we clearly observed that as the context we used in practice grows larger (let's say, above 4000 tokens), the model starts to "ignore" some of the information in the context. Hence, there is a clear tradeoff: 115 | - Injecting the results of previous stages into the context, may help the model to generate better code. 116 | - However, it may also cause the model to ignore specific details and nuances from the problem description. 117 | ___ 118 | 119 | **Q: Is this work "realistic" in terms of the number of LLM calls?** 120 | 121 | **A:** In comparison to AlphaCode, we do four orders of magnitude (!) fewer [calls](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/computational_effort.png?raw=true) (per solution AlphaCodium does 15-20 calls). 122 | Yet we acknowledge that for some applications, this may still be too much, and more optimizations are needed. We however believe that many of the ideas and principles we acquired in this work are broadly applicable, even when the number of calls is further limited. 123 | ___ 124 | **Q: Why do you iterate only on the generated code, and not on the AI-generated tests?** 125 | 126 | **A:** For code problems in CodeContests, the tests are a list of input-output pairs. Hence, you don't really learn anything new when you "fix" a test - you just change its output to the prediction of the generated code. Instead of fixing tests, we preferred to always try and fix the code, while using "test anchors". (see the [paper](https://arxiv.org/abs/2401.08500) for more details). 
127 | However, for other code generation tasks, where the tests are more complex and contain runnable code, iterating on the tests, in addition to iterating on the generated code, may be beneficial. 128 | 129 | 130 | ## Broader Applicability 131 | While this work presents results on CodeContests dataset, we believe that it has a broader applicability. 132 | 133 | First and foremost, we feel that the proposed AlphaCodium [flow](https://github.com/Codium-ai/AlphaCodium/blob/main/pics/proposed_flow.png?raw=true), with reasonable adjustments, can be used as a more general framework for other code generation tasks. 134 | 135 | Secondly, many of the design concepts, principles, and tricks we acquired in this work are broadly applicable as-is to any general code generation tasks. For example: 136 | - **YAML Structured output**: asking the model to generate an output in YAML format, equivalent to a given Pydantic class 137 | 138 | - **Semantic reasoning via bullet points analysis**: Bullet points analysis encourages an in-depth understanding of the problem, and forces the model to divide the output into logical semantic sections, leading to improved results 139 | 140 | - **LLMs do better when generating a modular code**: when asking the model to: `divide the generated code into small sub-functions, with meaningful names and functionality`, we observe a better-produced code, with fewer bugs, and higher success rates for the iterative fixing stages. 141 | 142 | - **Soft decisions with double validation**: with a double validation process, we add an extra step where, given the generated output, the model is asked to re-generate the same output, but correct it if needed 143 | 144 | - **Leave room for exploration**: since the model can be wrong, it’s better to avoid irreversible decisions, and leave room for exploration and code iterations with different possible solutions 145 | 146 | The list above is partial. See the [paper](https://arxiv.org/abs/2401.08500) for more details. The code provided [in the repo](https://github.com/Codium-ai/AlphaCodium/tree/main/alpha_codium/settings) can be used as a reference for better understanding the proposed concepts, and for applying them to other code generation tasks. 147 | 148 | 149 | ## Example Problem 150 | In this section, we present an example for a full problem from CodeContests dataset (test-set, problem 1), in order to demonstrate the complexity of the problems in the dataset, and the challenges they pose to LLMs. 151 | 152 | ``` 153 | problem name: '1575_B. Building an Amusement Park' 154 | 155 | problem description: 156 | Mr. Chanek lives in a city represented as a plane. He wants to build an amusement park in the shape of a circle of radius r. 157 | The circle must touch the origin (point (0, 0)). 158 | There are n bird habitats that can be a photo spot for the tourists in the park. The i-th bird habitat is at point p_i = (x_i, y_i). 159 | 160 | Find the minimum radius r of a park with at least k bird habitats inside. 161 | 162 | A point is considered to be inside the park if and only if the distance between p_i and the center of the park is less than or equal 163 | to the radius of the park. 164 | Note that the center and the radius of the park do not need to be integers. 165 | 166 | In this problem, it is guaranteed that the given input always has a solution with r ≤ 2 ⋅ 10^5. 
167 | 168 | Input 169 | 170 | The first line contains two integers n and k (1 ≤ n ≤ 10^5, 1 ≤ k ≤ n) — the number of bird habitats in the city and the number of bird 171 | habitats required to be inside the park. 172 | The i-th of the next n lines contains two integers x_i and y_i (0 ≤ |x_i|, |y_i| ≤ 10^5) — the position of the i-th bird habitat. 173 | 174 | Output 175 | 176 | Output a single real number r denoting the minimum radius of a park with at least k bird habitats inside. It is guaranteed that the given 177 | input always has a solution with r ≤ 2 ⋅ 10^5. 178 | Your answer is considered correct if its absolute or relative error does not exceed 10^{-4}. 179 | Formally, let your answer be a, and the jury's answer be b. Your answer is accepted if and only if \frac{|a - b|}{max{(1, |b|)}} ≤ 10^{-4}. 180 | 181 | Examples 182 | 183 | Input 184 | 185 | 8 4 186 | -3 1 187 | -4 4 188 | 1 5 189 | 2 2 190 | 2 -2 191 | -2 -4 192 | -1 -1 193 | -6 0 194 | 195 | Output 196 | 197 | 3.1622776589 198 | 199 | 200 | Input 201 | 202 | 1 1 203 | 0 0 204 | 205 | 206 | Output 207 | 208 | 0.0000000000 209 | 210 | Note 211 | 212 | In the first example, Mr. Chanek can put the center of the park at (-3, -1) with radius √{10} ≈ 3.162. It can be proven this is the minimum r. 213 | ``` 214 | 215 | 216 | ## Acknowledgments 217 | Our process CodeContests dataset is based on the original [CodeContests](https://huggingface.co/datasets/deepmind/code_contests) dataset. 218 | We removed the train set (which is not relevant to our work) and did some post-processing and cleaning to the validation and test sets. 219 | 220 | 221 | ## Citation 222 | ``` 223 | @misc{ridnik2024code, 224 | title={Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering}, 225 | author={Tal Ridnik and Dedy Kredo and Itamar Friedman}, 226 | year={2024}, 227 | eprint={2401.08500}, 228 | archivePrefix={arXiv}, 229 | primaryClass={cs.LG} 230 | } 231 | ``` 232 | 233 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: AlphaCodium Documentation 2 | description: Documentation for AlphaCodium - From Prompt Engineering to Flow Engineering. 
3 | repo_url: https://github.com/Codium-ai/AlphaCodium 4 | repo_name: Codium-ai/AlphaCodium 5 | 6 | nav: 7 | - 'index.md' 8 | 9 | theme: 10 | logo: assets/logo.svg 11 | favicon: assets/favicon.ico 12 | name: material 13 | icon: 14 | repo: fontawesome/brands/github 15 | features: 16 | - navigation.tabs 17 | - navigation.expand 18 | - navigation.path 19 | - navigation.top 20 | - navigation.tracking 21 | - navigation.indexes 22 | - search.suggest 23 | - search.highlight 24 | - content.tabs.link 25 | - content.code.annotation 26 | - content.code.copy 27 | - toc.integrate 28 | language: en 29 | custom_dir: overrides 30 | 31 | palette: 32 | - media: "(prefers-color-scheme)" 33 | toggle: 34 | icon: material/brightness-auto 35 | name: Switch to light mode 36 | - media: "(prefers-color-scheme: light)" 37 | scheme: default 38 | toggle: 39 | icon: material/toggle-switch-off-outline 40 | name: Switch to dark mode 41 | primary: custom 42 | accent: custom 43 | - media: "(prefers-color-scheme: dark)" 44 | scheme: slate 45 | toggle: 46 | icon: material/toggle-switch 47 | name: Switch to light mode 48 | primary: custom 49 | accent: custom 50 | 51 | plugins: 52 | - social 53 | - search 54 | 55 | extra: 56 | generator: false 57 | social: 58 | - icon: fontawesome/brands/github 59 | link: https://github.com/Codium-ai 60 | - icon: fontawesome/brands/discord 61 | link: https://discord.com/invite/SgSxuQ65GF 62 | - icon: fontawesome/brands/youtube 63 | link: https://www.youtube.com/@Codium-AI 64 | - icon: fontawesome/brands/linkedin 65 | link: https://www.linkedin.com/company/codiumai 66 | - icon: fontawesome/brands/twitter 67 | link: https://twitter.com/CodiumAI 68 | - icon: fontawesome/brands/instagram 69 | link: https://www.instagram.com/codiumai/ 70 | analytics: 71 | provider: custom 72 | property: ${{ secrets.GOOGLE_ANALYTICS_ID }} 73 | 74 | extra_css: 75 | - css/custom.css 76 | 77 | markdown_extensions: 78 | - pymdownx.highlight: 79 | anchor_linenums: true 80 | - pymdownx.inlinehilite 81 | - pymdownx.snippets 82 | - admonition 83 | - pymdownx.arithmatex: 84 | generic: true 85 | - footnotes 86 | - pymdownx.details 87 | - pymdownx.superfences 88 | - pymdownx.mark 89 | - attr_list 90 | - pymdownx.emoji: 91 | emoji_index: !!python/name:material.extensions.emoji.twemoji 92 | emoji_generator: !!python/name:materialx.emoji.to_svg 93 | - toc: 94 | title: On this page 95 | toc_depth: 3 96 | permalink: true 97 | 98 | copyright: | 99 | © 2024 CodiumAI 100 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | 7 | 9 | 10 | {% endblock %} -------------------------------------------------------------------------------- /docs/overrides/partials/footer.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Footer 7 | 80 | 81 | 82 | 83 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /docs/overrides/partials/integrations/analytics/custom.html: -------------------------------------------------------------------------------- 1 | 2 | 7 | -------------------------------------------------------------------------------- /my_problem_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sorting Andi and Budi Books Problem", 3 | "description": 
"Andi and Budi were given an assignment to tidy up their bookshelf of n books. Each book is represented by the book title — a string s_i numbered from 1 to n, each with length m. Andi really wants to sort the book lexicographically ascending, while Budi wants to sort it lexicographically descending.\n\nSettling their fight, they decided to combine their idea and sort it asc-desc-endingly, where the odd-indexed characters will be compared ascendingly, and the even-indexed characters will be compared descendingly.\n\nA string a occurs before a string b in asc-desc-ending order if and only if in the first position where a and b differ, the following holds:\n\n * if it is an odd position, the string a has a letter that appears earlier in the alphabet than the corresponding letter in b; \n * if it is an even position, the string a has a letter that appears later in the alphabet than the corresponding letter in b. \n\nInput\n\nThe first line contains two integers n and m (1 ≤ n ⋅ m ≤ 10^6).\n\nThe i-th of the next n lines contains a string s_i consisting of m uppercase Latin letters — the book title. The strings are pairwise distinct.\n\nOutput\n\nOutput n integers — the indices of the strings after they are sorted asc-desc-endingly.\n\nExample\n\nInput\n\n\n5 2\nAA\nAB\nBB\nBA\nAZ\n\n\nOutput\n\n\n5 2 1 3 4\n\nNote\n\nThe following illustrates the first example.\n\n", 4 | "public_tests": { 5 | "input": [ 6 | "5 2\nAA\nAB\nBB\nBA\nAZ\n" 7 | ], 8 | "is_valid_test": null, 9 | "output": [ 10 | "5 2 1 3 4 \n" 11 | ] 12 | }, 13 | "private_tests": { 14 | "input": [], 15 | "is_valid_test": null, 16 | "output": [] 17 | }, 18 | "generated_tests": { 19 | "input": [ 20 | "5 2\nAA\nAB\nBB\nBA\nZA\n", 21 | "5 2\nAA\nAB\nCB\nBA\nAZ\n", 22 | "2 2\nAA\nAB\nCB\nBA\nAZ\n", 23 | "2 2\nAA\nBC\nCB\nAB\nAZ\n", 24 | "1 2\nAA\nAC\nBC\nBA\nZA\n", 25 | "5 2\nAA\nAB\nBC\nBA\nAZ\n", 26 | "3 2\nAA\nAB\nCB\nAB\nAZ\n", 27 | "4 2\nBA\nAC\nEA\nAA\nZ@\n", 28 | "3 2\nAA\nBA\nCB\nAB\nAZ\n", 29 | "4 2\nAA\nAC\nCB\nBA\nAZ\n", 30 | "4 2\nBA\nBC\nEA\nAB\nZ@\n", 31 | "3 2\nBA\nAD\nAB\nBA\nC[\n", 32 | "3 2\nAA\nBA\nBC\nAB\nAZ\n", 33 | "5 2\nAA\nBA\nCC\nBB\nAZ\n", 34 | "2 2\nAA\nAB\nCB\nAB\nAZ\n", 35 | "2 2\nAA\nAC\nCB\nAB\nAZ\n", 36 | "2 2\nAA\nBC\nCB\nAB\nZA\n", 37 | "2 2\nAA\nBC\nBC\nAB\nZA\n", 38 | "2 2\nAA\nAC\nBC\nAB\nZA\n", 39 | "2 2\nAA\nAC\nBC\nBA\nZA\n", 40 | "1 2\nAA\nAC\nBC\nAA\nZA\n", 41 | "1 2\nAA\nAC\nCB\nAA\nZA\n", 42 | "1 2\nAA\nAC\nCB\nAA\nZ@\n", 43 | "2 2\nAA\nAC\nCB\nAA\nZ@\n", 44 | "2 2\nAA\nAC\nCA\nAA\nZ@\n", 45 | "2 2\nAB\nAC\nCA\nAA\nZ@\n", 46 | "2 2\nAB\nAC\nCA\nBA\nZ@\n", 47 | "2 2\nAB\nAC\nCA\nAB\nZ@\n", 48 | "2 2\nAB\nAC\nCA\nAB\n@Z\n", 49 | "2 2\nAB\nAC\nCA\nAB\n@Y\n", 50 | "5 2\nAA\nAB\nBB\nBA\nZB\n", 51 | "2 2\nAA\nAD\nCB\nAB\nAZ\n", 52 | "2 2\nAA\nBC\nBB\nAB\nAZ\n", 53 | "2 2\nBA\nBC\nCB\nAB\nZA\n", 54 | "2 2\nAA\nBC\nBD\nAB\nZA\n", 55 | "2 2\nAA\nAC\nBC\nAB\n[A\n", 56 | "2 2\nAA\nAC\nBC\nBA\nAZ\n", 57 | "1 2\nAA\nAB\nBC\nBA\nZA\n", 58 | "1 2\nAA\nAC\nBC\nAA\nAZ\n", 59 | "1 2\nBA\nAC\nCB\nAA\nZ@\n", 60 | "2 2\nAA\nAC\nBC\nAA\nZ@\n", 61 | "2 2\nAA\nAC\nDA\nAA\nZ@\n", 62 | "2 2\nAB\nAC\nCA\nAA\n[@\n", 63 | "2 2\nAB\nCA\nCA\nBA\nZ@\n", 64 | "2 2\nAB\nAC\nAC\nAB\nZ@\n", 65 | "2 2\nAB\nAC\nBA\nAB\n@Z\n", 66 | "2 2\nAB\nCA\nCA\nAB\n@Y\n", 67 | "5 2\nAA\nAB\nCC\nBA\nAZ\n", 68 | "2 2\nBA\nBC\nBB\nAB\nZA\n", 69 | "2 2\nAA\nAC\nAC\nAB\n[A\n", 70 | "2 2\nAA\nAC\nCB\nBA\nAZ\n", 71 | "1 2\nAA\nAB\nBC\nBA\nAZ\n", 72 | "1 2\nAA\nAC\nBC\nAA\n@Z\n", 73 | "1 2\nBA\nAC\nCA\nAA\nZ@\n", 74 | "2 2\nAA\nAC\nBC\nAA\nZ?\n", 75 | 
"3 2\nAA\nAC\nDA\nAA\nZ@\n", 76 | "2 2\nBA\nAC\nCA\nAA\n[@\n", 77 | "1 2\nAB\nCA\nCA\nBA\nZ@\n", 78 | "2 2\nAB\nCA\nAC\nAB\nZ@\n", 79 | "2 2\nAB\nAC\nBA\nBB\n@Z\n", 80 | "2 2\nAB\nCB\nCA\nAB\n@Y\n", 81 | "3 2\nAA\nAB\nCC\nBA\nAZ\n", 82 | "1 2\nBA\nBC\nBB\nAB\nZA\n", 83 | "2 2\nAA\nAC\nAC\nAC\n[A\n", 84 | "2 2\nAA\nAB\nCB\nCA\nAZ\n", 85 | "1 2\nAA\nAB\nBC\nBA\nA[\n", 86 | "1 2\nAA\nAC\nBC\nAB\n@Z\n", 87 | "1 2\nBA\nAC\nCA\nAA\n@Z\n", 88 | "2 2\nAA\nAC\nBC\nAA\n?Z\n", 89 | "3 2\nAA\nAC\nEA\nAA\nZ@\n", 90 | "2 2\nBA\nAC\nAC\nAA\n[@\n", 91 | "1 2\nBB\nCA\nCA\nBA\nZ@\n", 92 | "2 2\nAB\nBA\nAC\nAB\nZ@\n", 93 | "2 2\nAB\nAC\nBA\nBB\n?Z\n", 94 | "2 2\nAB\nCB\nCA\nAC\n@Y\n", 95 | "1 2\nBA\nCB\nBB\nAB\nZA\n", 96 | "2 2\nAA\nAC\nAC\nAC\n[B\n", 97 | "1 2\nAA\nAC\nCB\nBA\nAZ\n", 98 | "2 2\nAA\nAB\nBC\nBA\nA[\n", 99 | "1 2\nAA\nAC\nCB\nAB\n@Z\n", 100 | "1 2\nBA\nAC\nCA\nAB\n@Z\n", 101 | "2 2\nAA\nAC\nBC\nA@\n?Z\n", 102 | "3 2\nBA\nAC\nEA\nAA\nZ@\n", 103 | "1 2\nBB\nAC\nCA\nBA\nZ@\n", 104 | "2 2\nAA\nAC\nBA\nBB\n?Z\n", 105 | "2 2\nBA\nCB\nCA\nAC\n@Y\n", 106 | "2 2\nBA\nCB\nBB\nAB\nZA\n", 107 | "1 2\nAA\nAC\nAC\nAC\n[B\n", 108 | "1 2\nAA\nAC\nCB\nBA\nZA\n", 109 | "2 2\nAA\nAB\nCB\nBA\nA[\n", 110 | "1 2\nAA\nAC\nCB\nAB\nZ@\n", 111 | "1 2\nBA\nAC\nCB\nAB\n@Z\n", 112 | "2 2\nBA\nAC\nBC\nA@\n?Z\n", 113 | "1 2\nBB\nAC\nCA\nAB\nZ@\n", 114 | "2 2\nAA\nBC\nBA\nBB\n?Z\n", 115 | "2 2\nBA\nBC\nCA\nAC\n@Y\n", 116 | "1 2\nAA\nAC\nAC\nAC\n[C\n", 117 | "2 2\nAA\nAC\nCB\nBA\nA[\n", 118 | "1 2\nAA\nAC\nDB\nAB\nZ@\n", 119 | "2 2\nBA\nAD\nBC\nA@\n?Z\n", 120 | "4 2\nBA\nAC\nEA\nAB\nZ@\n", 121 | "1 2\nCB\nAC\nCA\nAB\nZ@\n", 122 | "2 2\nAA\nBC\nBA\nBB\nZ?\n", 123 | "2 2\nBA\nBB\nCA\nAC\n@Y\n", 124 | "1 2\nAA\nAC\nAC\nAD\n[C\n", 125 | "2 2\nAA\nCA\nCB\nBA\nA[\n", 126 | "2 2\nBA\nAD\nBC\nA?\n?Z\n", 127 | "1 2\nCB\nCA\nCA\nAB\nZ@\n", 128 | "2 2\nAA\nBC\nBB\nBB\nZ?\n", 129 | "2 2\nBA\nBB\nCA\nCA\n@Y\n", 130 | "1 2\nAA\nCA\nAC\nAD\n[C\n", 131 | "2 2\nAA\nCA\nBB\nBA\nA[\n", 132 | "2 2\nBA\nAD\nBB\nA?\n?Z\n", 133 | "2 2\nBA\nBB\nCA\nDA\n@Y\n", 134 | "2 2\nAA\nCA\nBB\nAB\nA[\n", 135 | "2 2\nBA\nDA\nBB\nA?\n?Z\n", 136 | "2 2\nAB\nBB\nCA\nDA\n@Y\n", 137 | "2 2\nAA\nCA\nBB\nAA\nA[\n", 138 | "2 2\nBA\nAD\nBB\nA?\n?Y\n", 139 | "2 2\nBA\nBB\nCA\nD@\n@Y\n", 140 | "2 2\nAA\nDA\nBB\nBA\nA[\n", 141 | "2 2\nAB\nAD\nBB\nA?\n?Y\n", 142 | "2 2\nBA\nBB\nAC\nD@\n@Y\n", 143 | "2 2\nAA\nDA\nBB\nAB\nA[\n", 144 | "2 2\nAB\nAD\nBA\nA?\n?Y\n", 145 | "2 2\nAA\nDA\nAB\nAB\nA[\n", 146 | "2 2\nAB\nAD\nAB\nA?\n?Y\n", 147 | "2 2\nAA\nDA\nAB\nBA\nA[\n", 148 | "2 2\nAB\nAD\nCA\nA?\n?Y\n", 149 | "2 2\nAA\nDA\nAB\nBA\nB[\n", 150 | "2 2\nAB\nAD\nCA\nA>\n?Y\n", 151 | "2 2\nAA\nDA\nAB\nCA\nB[\n", 152 | "1 2\nAA\nDA\nAB\nCA\nB[\n", 153 | "1 2\nAA\nAD\nAB\nCA\nB[\n", 154 | "1 2\nAA\nAD\nAB\nAC\nB[\n", 155 | "1 2\nAA\nAD\nAB\nAC\nC[\n", 156 | "1 2\nAA\nAD\nAB\nCA\nC[\n", 157 | "1 2\nAA\nAD\nAB\nBA\nC[\n", 158 | "2 2\nAA\nAD\nAB\nBA\nC[\n", 159 | "2 2\nBA\nAD\nAB\nBA\nC[\n", 160 | "2 2\nBA\nAE\nAB\nBA\nC[\n", 161 | "2 2\nBA\nAE\nAB\nBA\n[C\n", 162 | "2 2\nAA\nAB\nCB\nAC\nAZ\n", 163 | "3 2\nAA\nAC\nCB\nAB\nAZ\n", 164 | "2 2\nAA\nBC\nDB\nAB\nAZ\n", 165 | "2 2\nAA\nBC\nCB\nBA\nZA\n", 166 | "2 2\nAA\nBC\nBC\nAC\nZA\n", 167 | "2 2\nAA\nAC\nCB\nAB\nZA\n", 168 | "2 2\nAA\nAC\nCC\nBA\nZA\n", 169 | "2 2\nAA\nCA\nBC\nBA\nZA\n", 170 | "1 2\nAA\nAB\nBC\nAA\nZA\n", 171 | "1 2\nAA\nAC\nCC\nAA\nZA\n", 172 | "1 2\nAA\nAD\nCB\nAA\nZ@\n", 173 | "2 2\nAA\nAC\nCB\n@A\nZ@\n", 174 | "2 2\nAB\nCA\nCA\nAB\nZ@\n", 175 | "2 2\nAA\nAC\nCA\nAB\nZ@\n", 176 | "2 2\nAB\nAC\nAC\nAB\n@Z\n", 177 | "2 
2\nAB\nAC\nAC\nAB\n@Y\n", 178 | "2 2\nAA\nBC\nBD\nAB\nYA\n", 179 | "3 2\nAA\nAC\nBC\nAB\n[A\n", 180 | "2 2\nAA\nAC\nAC\nBA\nAZ\n", 181 | "1 1\nAA\nAC\nBC\nAA\nAZ\n", 182 | "1 2\nBA\nAC\nCB\nAA\n@Z\n", 183 | "2 2\nAB\nAC\nDA\nAA\nZ@\n", 184 | "2 2\nBA\nAC\nCB\nAA\n[@\n", 185 | "2 1\nAB\nCA\nCA\nBA\nZ@\n", 186 | "2 2\nBA\nAC\nCA\nAB\n@Z\n", 187 | "2 2\nAB\nBC\nBA\nAB\n@Z\n", 188 | "2 2\nAB\nCA\nAC\nAB\n@Y\n", 189 | "5 2\nAA\nAB\nCC\nBB\nAZ\n", 190 | "2 2\nBA\nCB\nBB\nAB\nAZ\n", 191 | "1 2\nAA\nAB\nBC\nBA\nAY\n", 192 | "1 2\nAA\nAC\nBC\nBA\n@Z\n", 193 | "2 2\nAB\nAC\nCB\nAA\n[@\n", 194 | "2 2\nAB\nCA\nAD\nAB\nZ@\n", 195 | "2 2\nAB\nCB\nCB\nAB\n@Y\n", 196 | "3 2\nAA\nAB\nCC\nBA\nAY\n", 197 | "1 2\nBA\nBC\nBB\nBB\nZA\n", 198 | "2 2\nAA\nCA\nAC\nAC\n[A\n", 199 | "2 2\nAA\nAB\nBC\nBA\nAZ\n", 200 | "1 2\nAA\nAB\nBB\nBA\nA[\n", 201 | "1 2\nAA\nAC\nBC\nAB\n@[\n", 202 | "1 2\nBA\nBC\nCA\nAA\nZ@\n", 203 | "2 2\nAA\nCA\nBC\nAA\n?Z\n", 204 | "3 2\nAB\nAC\nEA\nAA\nZ@\n", 205 | "2 2\nBA\nAC\nAC\nAA\nZ@\n", 206 | "1 2\nBB\nCA\nCA\nBA\n@Z\n", 207 | "2 2\nAB\nBA\nAC\nBA\nZ@\n", 208 | "2 2\nBB\nCB\nCA\nAC\n@Y\n", 209 | "1 2\nBA\nCB\nBB\nBA\nZA\n", 210 | "2 2\nBA\nAC\nAC\nAC\n[B\n", 211 | "1 2\nAA\nAC\nCB\nAA\nAZ\n", 212 | "2 2\nAA\nAB\nBC\nB@\nA[\n", 213 | "1 2\nAA\nAC\nCB\nAA\n@Z\n", 214 | "2 2\nAA\nAC\nCA\nBB\n?Z\n", 215 | "2 2\nAB\nCB\nBB\nAB\nZA\n", 216 | "1 2\nAA\nCA\nAC\nAC\n[B\n", 217 | "3 2\nAA\nAB\nCB\nBA\nA[\n", 218 | "1 1\nAA\nAC\nCB\nAB\nZ@\n", 219 | "1 2\nBA\nAC\nCC\nAB\n@Z\n" 220 | ], 221 | "is_valid_test": null, 222 | "output": [ 223 | "2 1 3 4 5\n", 224 | "5 2 1 4 3\n", 225 | "2 1\n", 226 | "1 2\n", 227 | "1\n", 228 | "5 2 1 3 4\n", 229 | "2 1 3\n", 230 | "2 4 1 3\n", 231 | "1 2 3\n", 232 | "2 1 4 3\n", 233 | "4 2 1 3\n", 234 | "2 3 1\n", 235 | "1 3 2\n", 236 | "5 1 4 2 3\n", 237 | "2 1\n", 238 | "2 1\n", 239 | "1 2\n", 240 | "1 2\n", 241 | "2 1\n", 242 | "2 1\n", 243 | "1\n", 244 | "1\n", 245 | "1\n", 246 | "2 1\n", 247 | "2 1\n", 248 | "2 1\n", 249 | "2 1\n", 250 | "2 1\n", 251 | "2 1\n", 252 | "2 1\n", 253 | "2 1 3 4 5\n", 254 | "2 1\n", 255 | "1 2\n", 256 | "2 1\n", 257 | "1 2\n", 258 | "2 1\n", 259 | "2 1\n", 260 | "1\n", 261 | "1\n", 262 | "1\n", 263 | "2 1\n", 264 | "2 1\n", 265 | "2 1\n", 266 | "1 2\n", 267 | "2 1\n", 268 | "2 1\n", 269 | "1 2\n", 270 | "5 2 1 4 3\n", 271 | "2 1\n", 272 | "2 1\n", 273 | "2 1\n", 274 | "1\n", 275 | "1\n", 276 | "1\n", 277 | "2 1\n", 278 | "2 1 3\n", 279 | "2 1\n", 280 | "1\n", 281 | "1 2\n", 282 | "2 1\n", 283 | "1 2\n", 284 | "2 1 3\n", 285 | "1\n", 286 | "2 1\n", 287 | "2 1\n", 288 | "1\n", 289 | "1\n", 290 | "1\n", 291 | "2 1\n", 292 | "2 1 3\n", 293 | "2 1\n", 294 | "1\n", 295 | "1 2\n", 296 | "2 1\n", 297 | "1 2\n", 298 | "1\n", 299 | "2 1\n", 300 | "1\n", 301 | "2 1\n", 302 | "1\n", 303 | "1\n", 304 | "2 1\n", 305 | "2 1 3\n", 306 | "1\n", 307 | "2 1\n", 308 | "1 2\n", 309 | "1 2\n", 310 | "1\n", 311 | "1\n", 312 | "2 1\n", 313 | "1\n", 314 | "1\n", 315 | "2 1\n", 316 | "1\n", 317 | "1 2\n", 318 | "2 1\n", 319 | "1\n", 320 | "2 1\n", 321 | "1\n", 322 | "2 1\n", 323 | "2 4 1 3\n", 324 | "1\n", 325 | "1 2\n", 326 | "2 1\n", 327 | "1\n", 328 | "1 2\n", 329 | "2 1\n", 330 | "1\n", 331 | "1 2\n", 332 | "2 1\n", 333 | "1\n", 334 | "1 2\n", 335 | "2 1\n", 336 | "2 1\n", 337 | "1 2\n", 338 | "1 2\n", 339 | "1 2\n", 340 | "1 2\n", 341 | "2 1\n", 342 | "2 1\n", 343 | "1 2\n", 344 | "2 1\n", 345 | "2 1\n", 346 | "1 2\n", 347 | "2 1\n", 348 | "1 2\n", 349 | "2 1\n", 350 | "1 2\n", 351 | "2 1\n", 352 | "1 2\n", 353 | "2 1\n", 354 | "1 2\n", 355 | "1\n", 356 | 
"1\n", 357 | "1\n", 358 | "1\n", 359 | "1\n", 360 | "1\n", 361 | "2 1\n", 362 | "2 1\n", 363 | "2 1\n", 364 | "2 1\n", 365 | "2 1\n", 366 | "2 1 3\n", 367 | "1 2\n", 368 | "1 2\n", 369 | "1 2\n", 370 | "2 1\n", 371 | "2 1\n", 372 | "1 2\n", 373 | "1\n", 374 | "1\n", 375 | "1\n", 376 | "2 1\n", 377 | "1 2\n", 378 | "2 1\n", 379 | "2 1\n", 380 | "2 1\n", 381 | "1 2\n", 382 | "2 1 3\n", 383 | "2 1\n", 384 | "1\n", 385 | "1\n", 386 | "2 1\n", 387 | "2 1\n", 388 | "1 2\n", 389 | "2 1\n", 390 | "1 2\n", 391 | "1 2\n", 392 | "5 2 1 4 3\n", 393 | "1 2\n", 394 | "1\n", 395 | "1\n", 396 | "2 1\n", 397 | "1 2\n", 398 | "1 2\n", 399 | "2 1 3\n", 400 | "1\n", 401 | "1 2\n", 402 | "2 1\n", 403 | "1\n", 404 | "1\n", 405 | "1\n", 406 | "1 2\n", 407 | "2 1 3\n", 408 | "2 1\n", 409 | "1\n", 410 | "1 2\n", 411 | "1 2\n", 412 | "1\n", 413 | "2 1\n", 414 | "1\n", 415 | "2 1\n", 416 | "1\n", 417 | "2 1\n", 418 | "1 2\n", 419 | "1\n", 420 | "2 1 3\n", 421 | "1\n", 422 | "1\n" 423 | ] 424 | } 425 | } -------------------------------------------------------------------------------- /pics/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/pics/comparison.png -------------------------------------------------------------------------------- /pics/computational_effort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/pics/computational_effort.png -------------------------------------------------------------------------------- /pics/example_problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/pics/example_problem.png -------------------------------------------------------------------------------- /pics/iterations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/pics/iterations.png -------------------------------------------------------------------------------- /pics/proposed_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/pics/proposed_flow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dynaconf==3.1.12 2 | fastapi==0.99.0 3 | PyGithub==1.59.* 4 | retry==0.9.2 5 | Jinja2==3.1.2 6 | tiktoken==0.5.2 7 | uvicorn==0.22.0 8 | pytest==7.4.0 9 | aiohttp==3.9.3 10 | atlassian-python-api==3.39.0 11 | GitPython==3.1.32 12 | PyYAML==6.0.1 13 | starlette-context==0.3.6 14 | boto3==1.28.25 15 | google-cloud-storage==2.10.0 16 | ujson==5.8.0 17 | azure-devops==7.1.0b3 18 | msrest==0.7.1 19 | ## 20 | openai 21 | litellm 22 | duckdb==0.9.2 23 | datasets 24 | notebook 25 | black 26 | evaluate 27 | click 28 | code_contests_tester==0.1.6 29 | aiolimiter 30 | Jinja2 31 | tqdm 32 | pysnooper 33 | loguru 34 | numpy 35 | retry 36 | pydantic>=2.8.2 37 | # uninstall ipython to catch breakpoints on debug with sandbox==false 38 | -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/tests/__init__.py -------------------------------------------------------------------------------- /tests/alpha_codium/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/tests/alpha_codium/__init__.py -------------------------------------------------------------------------------- /tests/alpha_codium/code_contests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/tests/alpha_codium/code_contests/__init__.py -------------------------------------------------------------------------------- /tests/alpha_codium/code_contests/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Codium-ai/AlphaCodium/eb7577dbe998ae7e55696264591ac3c5dde75638/tests/alpha_codium/code_contests/eval/__init__.py -------------------------------------------------------------------------------- /tests/alpha_codium/code_contests/eval/test_local_exec.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import inspect 3 | import io 4 | import math 5 | import os 6 | import sys 7 | import tempfile 8 | from contextlib import contextmanager 9 | from functools import partial 10 | from typing import Callable, List 11 | 12 | import pytest as pytest 13 | from pysnooper import snoop 14 | 15 | from alpha_codium.code_contests.eval.local_exec import MultiTestResult, ProgramStatus, execute_candidate_code 16 | from alpha_codium.code_contests.eval.tracer import snooper_kwargs 17 | 18 | timeout = 3 19 | 20 | 21 | @contextmanager 22 | def mock_input_output(mock_input_value): 23 | new_out = io.StringIO() 24 | new_in = io.StringIO(mock_input_value + '\n') 25 | 26 | old_out = sys.stdout 27 | old_in = sys.stdin 28 | 29 | sys.stdout = new_out 30 | sys.stdin = new_in 31 | 32 | yield new_out 33 | 34 | sys.stdout = old_out 35 | sys.stdin = old_in 36 | 37 | 38 | class SandboxCaseContainer: 39 | 40 | def __init__(self, f: Callable): 41 | self.f = f 42 | 43 | def execute_as_string(self, input: str, sandbox=False): 44 | return self.execute_as_str_inner([input], trace=False, sandbox=sandbox) 45 | 46 | def execute_as_string_with_tracing(self, input: str, sandbox=False): 47 | return self.execute_as_str_inner([input], trace=True, sandbox=sandbox) 48 | 49 | def execute_as_str_inner(self, inputs: List[str], trace=False, sandbox=False): 50 | check_program = self.get_body() 51 | f = partial(execute_candidate_code, candidate=check_program, inputs=inputs, test_id=self.f.__name__, 52 | timeout=timeout, sandbox=sandbox, snoop=trace) 53 | if sandbox: 54 | with tempfile.TemporaryDirectory() as temp_dir: 55 | os.chdir(temp_dir) 56 | result = f() 57 | else: 58 | result = f() 59 | 60 | return result 61 | 62 | def get_body(self): 63 | function_body = inspect.getsource(self.f) 64 | func_ast = ast.parse(function_body) 65 | func_def = [node for node in ast.walk(func_ast) if isinstance(node, ast.FunctionDef)][0] 66 | body = func_def.body 67 | lines = [ast.unparse(node).strip() for node in body] 68 | result = "\n".join(lines).strip() 69 | print(result) 70 | return result 71 | 72 | 73 | def 
io_solution(): 74 | x = input() 75 | print(x) 76 | 77 | 78 | def one_level_and_loop_solution(): 79 | def my_func(val): 80 | for i in range(val): 81 | print(i) 82 | 83 | x = int(input()) 84 | my_func(x) 85 | 86 | 87 | def multi_level_and_loop_solution(): 88 | def printer_inner(val): 89 | print(val) 90 | 91 | def printer(val): 92 | print("p") 93 | printer_inner(val) 94 | 95 | def my_func(val): 96 | for i in range(val): 97 | printer(i) 98 | 99 | x = int(input()) 100 | my_func(x) 101 | 102 | 103 | def recursion_solution(): 104 | def fibonacci(n): 105 | if n <= 0: 106 | return 0 107 | elif n == 1: 108 | return 1 109 | else: 110 | return fibonacci(n - 1) + fibonacci(n - 2) 111 | 112 | x = int(input()) 113 | fib = fibonacci(x) 114 | print(fib) 115 | 116 | 117 | def timeout_solution(): 118 | def sleeper(timeout): 119 | import time 120 | print(f"sleeping for {timeout + 1}") 121 | time.sleep(timeout + 1) 122 | 123 | timeout = int(input()) 124 | sleeper(timeout) 125 | 126 | 127 | def exception_solution(): 128 | def excepter(n): 129 | raise ValueError(f"test run cannot accept {n}") 130 | 131 | x = int(input()) 132 | excepter(x) 133 | 134 | 135 | def bad_import_solution(): 136 | print(math.sqrt(int(input()))) 137 | 138 | 139 | test_data = [ 140 | (io_solution, 'hello', 'hello'), # (function, input, expected output) 141 | (one_level_and_loop_solution, '4', '0\n1\n2\n3'), 142 | (multi_level_and_loop_solution, '4', 'p\n0\np\n1\np\n2\np\n3'), 143 | (recursion_solution, '4', '3'), 144 | ] 145 | 146 | run_types = ['regular', 'regular_with_tracing', 'as_string', 'as_string_with_tracing'] 147 | 148 | 149 | def data_id(test_case): 150 | f, input_, output_ = test_case 151 | return f"{f.__name__}-{hash(str(input_) + str(output_))}" 152 | 153 | sandbox_ids=["not-sandboxed", "sandboxed"] 154 | 155 | 156 | @pytest.mark.parametrize("run_type", run_types) 157 | @pytest.mark.parametrize("sandbox", [False, True], ids=sandbox_ids) 158 | @pytest.mark.parametrize("func, input, expected", test_data, ids=[data_id(case) for case in test_data]) 159 | def test_happy_paths(monkeypatch, func, input, expected, run_type, sandbox): 160 | def assert_passed_and_output(expected, result: MultiTestResult): 161 | assert len(result.test_results) == 1 162 | my_result = result.test_results[0] 163 | assert my_result.stdout == expected 164 | assert my_result.stderr == '' 165 | print("trace\n") 166 | print(my_result.trace) 167 | 168 | my_case = SandboxCaseContainer(func) 169 | if 'regular' in run_type: 170 | with mock_input_output(input) as captured_output: 171 | if 'regular_with_tracing' == run_type: 172 | with snoop(**snooper_kwargs): 173 | my_case.f() 174 | else: 175 | my_case.f() 176 | result = captured_output.getvalue().strip() 177 | assert expected == result 178 | 179 | elif run_type == 'as_string': 180 | res = my_case.execute_as_string(input, sandbox=sandbox) 181 | assert_passed_and_output(expected, res) 182 | 183 | elif run_type == 'as_string_with_tracing': 184 | res = my_case.execute_as_string_with_tracing(input, sandbox=sandbox) 185 | assert_passed_and_output(expected, res) 186 | 187 | 188 | test_exception_data = [ 189 | (timeout_solution, str(timeout), ProgramStatus.kTimeout, ''), 190 | (exception_solution, '1', ProgramStatus.kFailed, 'test run cannot accept 1'), 191 | (bad_import_solution, '1', ProgramStatus.kFailed, "NameError: name 'math' is not defined"), 192 | ] 193 | 194 | def exception_data_id(test_case): 195 | f, input_, status, _ = test_case 196 | return f"{f.__name__}-{str(status)}-{hash(input_)}" 197 | 198 | 
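# test_runtime_issues (below) feeds the failure cases defined above through execute_candidate_code:
# a solution that sleeps past the timeout (ProgramStatus.kTimeout), one that raises a ValueError,
# and one that uses `math` without importing it inside the candidate body (both ProgramStatus.kFailed).
# It asserts the reported program status and that the expected message appears in sandbox_result;
# only the string-based run types ('as_string', 'as_string_with_tracing') perform assertions here.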
@pytest.mark.parametrize("run_type", run_types) 199 | @pytest.mark.parametrize("sandbox", [False, True], ids=sandbox_ids) 200 | @pytest.mark.parametrize("func, input, status, error_string", test_exception_data, 201 | ids=[exception_data_id(case) for case in test_exception_data]) 202 | def test_runtime_issues(monkeypatch, func, input, status, error_string, run_type, sandbox): 203 | def assert_status_and_error(result: MultiTestResult, status, err): 204 | assert len(result.test_results) == 1 205 | my_result = result.test_results[0] 206 | assert my_result.program_status == status 207 | assert err in my_result.sandbox_result 208 | print("trace") 209 | print(my_result.trace) 210 | print("=============") 211 | print("stack trace") 212 | print(my_result.sandbox_result) 213 | 214 | my_case = SandboxCaseContainer(func) 215 | 216 | if run_type == 'as_string': 217 | res = my_case.execute_as_string(input) 218 | assert_status_and_error(res, status, error_string) 219 | 220 | elif run_type == 'as_string_with_tracing': 221 | res = my_case.execute_as_string_with_tracing(input) 222 | assert_status_and_error(res, status, error_string) 223 | 224 | 225 | if __name__ == '__main__': 226 | timeout_solution() 227 | --------------------------------------------------------------------------------