├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── data ├── APPS │ └── selected150.jsonl ├── HumanEval │ ├── HumanEval.jsonl │ ├── HumanEvalET.jsonl │ └── HumanEvalIncreasedSampleIO.jsonl ├── MBPPEval │ ├── MBPP.jsonl │ ├── MBPP_ET.jsonl │ └── mbpp-py.jsonl └── xCodeEval │ ├── problem_descriptions.jsonl │ ├── prog_syn_val.jsonl │ └── unittest_db.json ├── images ├── CodeSim-Overview.png ├── basic-results.png ├── contest-results.png └── opensource-llm-results.png ├── requirements.txt └── src ├── constants ├── __init__.py ├── lang_mappings.py ├── paths.py └── verboseType.py ├── datasets ├── APPSDataset.py ├── CodeContestDataset.py ├── Dataset.py ├── DatasetFactory.py ├── HumanEvalDataset.py ├── MBPPDataset.py ├── XCodeDataset.py ├── __init__.py ├── convert-apps-xcode.py └── convert-cc-xcode.py ├── evaluations ├── __init__.py ├── api_comm.py ├── evalute.py ├── exec_outcome.py ├── executor_utils.py ├── func_evaluate.py ├── limits_by_lang.yaml └── resource_limit.py ├── main.py ├── models ├── Anthropic.py ├── Base.py ├── Gemini.py ├── GroqModel.py ├── ModelFactory.py ├── OpenAI.py └── __init__.py ├── promptings ├── Analogical.py ├── Base.py ├── CoT.py ├── CodeSIM.py ├── Direct.py ├── MapCoder.py ├── PromptingFactory.py ├── SelfPlanning.py ├── __init__.py └── variations │ ├── CodeSIMA.py │ ├── CodeSIMC.py │ ├── CodeSIMWD.py │ ├── CodeSIMWPV.py │ └── CodeSIMWPVD.py ├── results ├── Results.py └── __init__.py └── utils ├── __init__.py ├── evaluateET.py ├── generateEP.py ├── jsonl.py ├── parse.py ├── runEP.py ├── summary.py └── tokenCount.py /.env.example: -------------------------------------------------------------------------------- 1 | # configure the following for Azure 2 | API_TYPE="azure" 3 | AZURE_API_VERSION= 4 | AZURE_API_URL= 5 | AZURE_API_KEY= 6 | 7 | # for openai (uncomment API_TYPE="openai" and set OPENAI_API_KEY) 8 | # API_TYPE="openai" 9 | OPENAI_API_KEY= 10 | OPENAI_API_URL="https://api.openai.com/" 11 | 12 | # for gemini 13 | GEMINI_API_KEY= 14 | 15 | # for groq 16 | GROQ_API_KEY= 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | __pycache__ 3 | **__pycache__ 4 | .vscode 5 | *workspace* 6 | 7 | samples 8 | desktop.ini 9 | logs 10 | 11 | test* 12 | 13 | results 14 | outputs 15 | 16 | costing*.csv 17 | usage*.csv 18 | 19 | .venv 20 | 21 | temp* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 KAGNLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | # CodeSim: Multi-Agent Code Generation and Problem Solving through Simulation-Driven Planning and Debugging 7 | 8 |

9 | • 👨‍💻 Code 10 | • 📃 arXiv 11 | • 🤗 Hugging Face 12 | • 📰 Papers With Code 13 | • 🌐 Website 14 |

15 | 16 | ## News 17 | 18 | - 📢 With o3-mini, CodeSim achieved a new pass@1 score of 98.8% on HumanEval and became 2nd on the [Papers with Code leaderboard](https://paperswithcode.com/sota/code-generation-on-humaneval). Note that this is even without using any external debuggers! The results log can be found [here](https://huggingface.co/ashraful/CodeSIM/tree/main/results/HumanEval/CodeSIM/o3-mini). 19 | - 🎉 CodeSim was accepted to NAACL 2025 Findings. 20 | - 📢 The full results log can be found [here](https://huggingface.co/ashraful/CodeSIM/tree/main/results). 21 | - 📢 Added CodeSim, which outperforms MapCoder. 22 | - 🎉 MapCoder was accepted to [ACL 2024](https://aclanthology.org/2024.acl-long.269/). 23 | - 📢 Our entire codebase is open-sourced under the MIT License. 24 | 25 | ## Abstract 26 | 27 | Large Language Models (LLMs) have made significant strides in code generation and problem solving. Current approaches employ external tool-based iterative debuggers that use compiler or other tool-based runtime feedback to refine coarse programs generated by various methods. However, the effectiveness of these approaches heavily relies on the quality of the initial code generation, which remains an open challenge. In this paper, we introduce CodeSIM, a novel multi-agent code generation framework that comprehensively addresses the stages of program synthesis—planning, coding, and debugging—through a human-like perception approach. Just as humans verify their understanding of an algorithm through visual simulation, CodeSIM uniquely features a method of plan verification and internal debugging through the step-by-step simulation of input/output. Extensive experiments across seven challenging competitive problem-solving and program synthesis benchmarks demonstrate CodeSIM's remarkable code generation capabilities. Our framework achieves new state-of-the-art 28 | (pass@1) results—**(HumanEval 95.1%, MBPP 90.7%, APPS 22%, and CodeContests 29.1%)**. Furthermore, our method shows potential for even greater enhancement when cascaded with external debuggers. 29 | 30 | ## CodeSim Overview 31 | 32 | ![CodeSim Overview](./images/CodeSim-Overview.png) 33 | Our goal is to develop a multi-agent code generation approach capable of complex problem solving. Drawing inspiration from recent works such as [MapCoder](https://aclanthology.org/2024.acl-long.269/), we devise the agents in CodeSIM for planning, coding, and debugging. While these existing approaches focus primarily on expanding steps without verifying the underlying hypotheses, we address this limitation by introducing a novel verification approach. Our approach simulates input/output step by step, verifying generated plans and performing internal debugging, mirroring how humans understand, visualize, and refine algorithms during development. Below, we present our proposed model. 34 | 35 | ### » Planning Agent 36 | 37 | The first component of CodeSIM is the *Planning Agent*. Given a problem description, the *Planning Agent* generates a single exemplar—a relevant problem along with its plan and solution. This mimics the behavior of human programmers, who, when faced with a new problem, first recall a similar problem they have previously solved. This exemplar-based recall is crucial, as it provides a starting point for constructing a solution plan. Instead of generating multiple ungrounded exemplars as in MapCoder, our agent focuses on only one at a time. We then instruct the LLM to generate an appropriate plan. 
Once the plan is created, the LLM simulates the solution step by step on a sample input. If the simulation result does not match the expected output, the agent prompts the LLM to revise the plan; otherwise, the plan is deemed valid. In the case of failure, the *Planning Agent* refines the plan. 38 | 39 | ### » Coding Agent 40 | 41 | The next component is the *Coding Agent*, which takes the problem description and the plan generated by the *Planning Agent* as input. The role of this agent is to translate the plan into executable code that solves the given problem. Once the code is generated, CodeSIM evaluates it using the sample input/output test cases. If the code passes all sample tests, it is returned as the final solution. Otherwise, the code is handed over to the next agent for further refinement. 42 | 43 | ### » Debugging Agent 44 | 45 | The final component, the *Debugging Agent*, receives the original problem, the plan from the *Planning Agent*, the code generated by the *Coding Agent*, and the execution (unit testing) log as input to debug the code. To identify bugs, instead of directly prompting the LLM, we uniquely leverage simulation once again. The LLM is instructed to simulate the code on the inputs where it fails to produce the expected output, allowing it to trace the execution step by step and locate the error. Once the bug is identified, the LLM modifies the code to resolve the issue. 46 | 47 | ## Results of CodeSim on Seven Benchmarks 48 | 49 | 52 | 53 | 54 | 55 | ![Basic Results](./images/basic-results.png) 56 | 57 | ![Contest Results](./images/contest-results.png) 58 | 59 | ![Open-source LLM Results](./images/opensource-llm-results.png) 60 | 61 | ## Running our project 62 | 63 | 1. Clone our project 64 | 65 | ``` 66 | git clone https://github.com/kagnlp/CodeGenerator && cd CodeGenerator 67 | ``` 68 | 69 | 2. Create a new conda or Python virtual environment and run the following command 70 | 71 | ``` 72 | pip install -r requirements.txt 73 | ``` 74 | 75 | 3. Set up the `.env` file following the provided `.env.example`. 76 | 4. Run the following command to see the options for running this project 77 | 78 | ``` 79 | python src/main.py --help 80 | ``` 81 | 82 | 5. Finally, run the project. An example is given below: 83 | 84 | ``` 85 | python src/main.py --dataset HumanEval --strategy CodeSIM --model_provider openai --model o3-mini 86 | ``` 87 | 88 | 6. To run this project on the competitive datasets, you need to set up [ExecEval](https://github.com/ntunlp/ExecEval) for Docker-based execution. Please visit this [link](https://github.com/ntunlp/ExecEval) to set up a Docker container and run it on port 5000. Change line 50 of `src/evaluations/api_comm.py` if you use a different setup. 89 | 90 | ## Citation 91 | 92 | ``` 93 | @misc{islam2025codesim, 94 | title={CODESIM: Multi-Agent Code Generation and Problem Solving through 95 | Simulation-Driven Planning and Debugging}, 96 | author={Md. 
Ashraful Islam and Mohammed Eunus Ali and Md Rizwan Parvez}, 97 | year={2025}, 98 | eprint={2502.05664}, 99 | archivePrefix={arXiv}, 100 | primaryClass={cs.CL}, 101 | url={https://arxiv.org/abs/2502.05664}, 102 | } 103 | 104 | @article{islam2024mapcoder, 105 | title={MapCoder: Multi-Agent Code Generation for Competitive Problem Solving}, 106 | author={Islam, Md Ashraful and Ali, Mohammed Eunus and Parvez, Md Rizwan}, 107 | journal={arXiv preprint arXiv:2405.11403}, 108 | year={2024} 109 | } 110 | 111 | @misc{parvez2021RAGCodeGen, 112 | title={Retrieval Augmented Code Generation and Summarization}, 113 | author={Md Rizwan Parvez and Wasi Uddin Ahmad and Saikat Chakraborty and Baishakhi Ray and Kai-Wei Chang}, 114 | year={2021}, 115 | eprint={2108.11601}, 116 | archivePrefix={arXiv}, 117 | primaryClass={cs.SE}, 118 | url={https://arxiv.org/abs/2108.11601}, 119 | } 120 | @misc{khan2023xcodeeval, 121 | title={xCodeEval: A Large Scale Multilingual Multitask Benchmark for Code Understanding, Generation, Translation and Retrieval}, 122 | author={Mohammad Abdullah Matin Khan and M Saiful Bari and Xuan Long Do and Weishi Wang and Md Rizwan Parvez and Shafiq Joty}, 123 | year={2023}, 124 | eprint={2303.03004}, 125 | archivePrefix={arXiv}, 126 | primaryClass={cs.CL}, 127 | url={https://arxiv.org/abs/2303.03004}, 128 | } 129 | 130 | ``` 131 | -------------------------------------------------------------------------------- /images/CodeSim-Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/images/CodeSim-Overview.png -------------------------------------------------------------------------------- /images/basic-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/images/basic-results.png -------------------------------------------------------------------------------- /images/contest-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/images/contest-results.png -------------------------------------------------------------------------------- /images/opensource-llm-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/images/opensource-llm-results.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # python==3.11 2 | openai 3 | numpy 4 | pandas 5 | python-dotenv 6 | pyyml 7 | tenacity 8 | tiktoken 9 | tqdm 10 | gensim 11 | jsonlines 12 | astunparse 13 | pyarrow 14 | google-generativeai 15 | accelerate 16 | groq 17 | matplotlib 18 | -------------------------------------------------------------------------------- /src/constants/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/constants/__init__.py -------------------------------------------------------------------------------- /src/constants/lang_mappings.py: -------------------------------------------------------------------------------- 1 | 
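# Maps the language names used throughout this repo (CLI options, dataset prompts) to the
# runtime identifiers expected by the ExecEval execution service; the mapped names are also
# the keys used in src/evaluations/limits_by_lang.yaml.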
LANGUAGE_MAPPING = { 2 | "Python": "Python 3", 3 | "Python3": "Python 3", 4 | "C#": "C# 10", 5 | "NET-CORE": ".NET Core C#", 6 | # "Node": "Node.js", 7 | "Rust": "Rust", 8 | # "Java":"Java 17", 9 | "PHP": "PHP", 10 | "Go": "Go", 11 | "Ruby": "Ruby", 12 | "C++": "GNU C++17", 13 | "C": "GNU C" 14 | } 15 | -------------------------------------------------------------------------------- /src/constants/paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join, dirname 3 | 4 | # HumanEval Dataset 5 | HUMAN_DATA_DIR = join( 6 | "data", 7 | "HumanEval", 8 | ) 9 | 10 | HUMAN_DATA_PATH = join( 11 | HUMAN_DATA_DIR, 12 | "HumanEval.jsonl" 13 | ) 14 | 15 | HUMAN_WST_DATA_PATH = join( 16 | HUMAN_DATA_DIR, 17 | "HumanEvalWST.jsonl" 18 | ) 19 | 20 | HUMAN_REFLXION_FILTERED_PATH = join( 21 | HUMAN_DATA_DIR, 22 | "humaneval-py.jsonl" 23 | ) 24 | 25 | HUMAN_HARDSET_PATH = join( 26 | HUMAN_DATA_DIR, 27 | "humaneval-py_hardest50.jsonl" 28 | ) 29 | 30 | HUMAN_ET_DATA_PATH = join( 31 | HUMAN_DATA_DIR, 32 | "HumanEvalET.jsonl" 33 | ) 34 | 35 | HUMAN_SIMILAR_PROBLEMS_PATH = join( 36 | HUMAN_DATA_DIR, 37 | "similar_problems_solutions.jsonl" 38 | ) 39 | 40 | 41 | # MBPP Dataset 42 | MBPP_DATA_DIR = join( 43 | "data", 44 | "MBPPEval", 45 | ) 46 | 47 | MBPP_DATA_PATH = join( 48 | MBPP_DATA_DIR, 49 | "mbpp-py.jsonl" 50 | ) 51 | 52 | MBPP_ET_DATA_PATH = join( 53 | MBPP_DATA_DIR, 54 | "MBPP_ET.jsonl" 55 | ) 56 | 57 | MBPP_SANITIZED_DATA_PATH = join( 58 | MBPP_DATA_DIR, 59 | "MBPP_SANITIZED.json" 60 | ) 61 | 62 | MBPP_SIMILAR_PROBLEMS_PATH = join( 63 | MBPP_DATA_DIR, 64 | "similar_problems_solutions.jsonl" 65 | ) 66 | 67 | # XCodeEval Dataset 68 | XCODE_DATA_DIR = join( 69 | "data", 70 | "xCodeEval", 71 | ) 72 | 73 | XCODE_VALIDATION_DATA_PATH = join( 74 | XCODE_DATA_DIR, 75 | "prog_syn_val.jsonl" 76 | ) 77 | 78 | XCODE_TEST_DATA_PATH = join( 79 | XCODE_DATA_DIR, 80 | "prog_syn_test.jsonl" 81 | ) 82 | 83 | XCODE_TRAIN_DATA_DIR_PATH = join( 84 | XCODE_DATA_DIR, 85 | "train" 86 | ) 87 | 88 | XCODE_UNIT_TEST_PATH = join( 89 | XCODE_DATA_DIR, 90 | "unittest_db.json" 91 | ) 92 | 93 | XCODE_PROBLEM_DESCRIPTION_PATH = join( 94 | XCODE_DATA_DIR, 95 | "problem_descriptions.jsonl" 96 | ) 97 | 98 | XCODE_SIMILAR_SRC_UIDS_PATH = join( 99 | XCODE_DATA_DIR, 100 | "similar_src_uids.json" 101 | ) 102 | 103 | XCODE_SIMILAR_PROBLEMS_PATH = join( 104 | XCODE_DATA_DIR, 105 | "similar_problems_solutions.json" 106 | ) 107 | 108 | XCODE_PROBLEM_FILE_MAPPINGS_PATH = join( 109 | XCODE_DATA_DIR, 110 | "problem_file_mapping.json" 111 | ) 112 | 113 | 114 | # Code Contest Dataset 115 | CODE_CONTEST_DATA_DIR = join( 116 | "data", 117 | "CodeContest", 118 | ) 119 | 120 | CODE_CONTEST_DATA_PATH = join( 121 | CODE_CONTEST_DATA_DIR, 122 | "Test.jsonl" 123 | ) 124 | 125 | 126 | # APPS Dataset 127 | APPS_DATA_DIR = join( 128 | "data", 129 | "APPS", 130 | ) 131 | 132 | APPS_DATA_PATH = join( 133 | APPS_DATA_DIR, 134 | "selected150.jsonl" 135 | ) 136 | -------------------------------------------------------------------------------- /src/constants/verboseType.py: -------------------------------------------------------------------------------- 1 | VERBOSE_FULL = 2 2 | VERBOSE_MINIMAL = 1 3 | VERBOSE_NONE = 0 4 | -------------------------------------------------------------------------------- /src/datasets/APPSDataset.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .Dataset import Dataset 3 | from 
evaluations.evalute import contest_evaluate, contest_evaluate_public_tests 4 | from constants.paths import * 5 | 6 | 7 | class APPSDataset(Dataset): 8 | def __init__( 9 | self, 10 | path: str = APPS_DATA_PATH, 11 | ): 12 | super().__init__(path) 13 | self.id_key = "id" 14 | 15 | def evaluate( 16 | self, 17 | item: dict, 18 | cur_imp: str, 19 | language: str, 20 | ): 21 | return contest_evaluate( 22 | generated_code=cur_imp, 23 | id=item["id"], 24 | tests=item["test_list"], 25 | lang=language 26 | ) 27 | 28 | def evaluate_sample_io( 29 | self, 30 | item: dict, 31 | cur_imp: str, 32 | language: str, 33 | ): 34 | if len(item["sample_io"]) == 0: 35 | return True, "" 36 | return contest_evaluate_public_tests( 37 | generated_code=cur_imp, 38 | id=item["id"], 39 | tests=item["sample_io"], 40 | lang=language 41 | ) 42 | 43 | def evaluate_additional_io( 44 | self, 45 | id: int, 46 | tests: List[str], 47 | cur_imp: str, 48 | language: str, 49 | ): 50 | 51 | if tests == []: 52 | return True, '' 53 | 54 | return contest_evaluate_public_tests( 55 | generated_code=cur_imp, 56 | id=id, 57 | tests=tests, 58 | lang=language 59 | ) 60 | 61 | @staticmethod 62 | def get_prompt(item): 63 | sample_io_format = "" 64 | if len(item['sample_io']) > 0: 65 | sample_io_format = f"Sample Input Format:\n{item['sample_io'][0]['input']}\nSample Output Format:\n{item['sample_io'][0]['output'][0]}\n\n-------\n" 66 | 67 | return f"{item['description']}\n\n{sample_io_format}\n-------\nImportant Note: You must follow the input output format. Input should be taken from standard input and output should be given to standard output.\nNote: If you are writing a function then after the function definition take input from using `input()` function, call the function with specified parameters and finally print the output of the function." 68 | -------------------------------------------------------------------------------- /src/datasets/CodeContestDataset.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .Dataset import Dataset 3 | from evaluations.evalute import contest_evaluate, contest_evaluate_public_tests 4 | from constants.paths import * 5 | 6 | class CodeContestDataset(Dataset): 7 | def __init__( 8 | self, 9 | path: str=CODE_CONTEST_DATA_PATH, 10 | ): 11 | super().__init__(path) 12 | self.id_key = "id" 13 | 14 | def evaluate( 15 | self, 16 | item: dict, 17 | cur_imp: str, 18 | language: str, 19 | ): 20 | return contest_evaluate( 21 | generated_code=cur_imp, 22 | id=item["id"], 23 | tests=item["test_list"], 24 | lang=language 25 | ) 26 | 27 | def evaluate_sample_io( 28 | self, 29 | item: dict, 30 | cur_imp: str, 31 | language: str, 32 | ): 33 | return contest_evaluate_public_tests( 34 | generated_code=cur_imp, 35 | id=item["id"], 36 | tests=item["sample_io"], 37 | lang=language 38 | ) 39 | 40 | def evaluate_additional_io( 41 | self, 42 | id: int, 43 | tests: List[str], 44 | cur_imp: str, 45 | language: str, 46 | ): 47 | 48 | if tests == []: 49 | return True, '' 50 | 51 | return contest_evaluate_public_tests( 52 | generated_code=cur_imp, 53 | id=id, 54 | tests=tests, 55 | lang=language 56 | ) 57 | 58 | @staticmethod 59 | def get_prompt(item): 60 | return f"{item['description']}\n\n-------\nImportant Note: You must follow the input output format. Input must be taken from standard input and output must be given to standard output. The code will be tested against multiple test cases and all the test cases must be passed." 
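        # Note: the prompt insists on stdin/stdout I/O because ExecEval feeds each test case to the
        # generated program through standard input and compares whatever it prints on standard output.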
61 | -------------------------------------------------------------------------------- /src/datasets/Dataset.py: -------------------------------------------------------------------------------- 1 | from utils.jsonl import read_jsonl 2 | 3 | 4 | class Dataset(object): 5 | def __init__( 6 | self, 7 | path: str, 8 | ): 9 | self.path = path 10 | self.data = None 11 | self.id_key = "" 12 | self.load() 13 | 14 | def load(self): 15 | self.data = read_jsonl(self.path) 16 | 17 | def __len__(self): 18 | return len(self.data) 19 | 20 | def __getitem__(self, idx): 21 | return self.data[idx] 22 | 23 | def evaluate( 24 | self, 25 | item: dict, 26 | cur_imp: str, 27 | language: str, 28 | ): 29 | raise NotImplementedError 30 | 31 | @staticmethod 32 | def get_prompt(item): 33 | raise NotImplementedError 34 | -------------------------------------------------------------------------------- /src/datasets/DatasetFactory.py: -------------------------------------------------------------------------------- 1 | from datasets.Dataset import Dataset 2 | from datasets.MBPPDataset import MBPPDataset 3 | from datasets.APPSDataset import APPSDataset 4 | from datasets.XCodeDataset import XCodeDataset 5 | from datasets.HumanEvalDataset import HumanDataset 6 | from datasets.CodeContestDataset import CodeContestDataset 7 | 8 | 9 | class DatasetFactory: 10 | @staticmethod 11 | def get_dataset_class(dataset_name): 12 | dataset_name = dataset_name.lower() 13 | if dataset_name == "apps": 14 | return APPSDataset 15 | elif dataset_name == "mbpp": 16 | return MBPPDataset 17 | elif dataset_name == "xcode": 18 | return XCodeDataset 19 | elif dataset_name == "xcodeeval": 20 | return XCodeDataset 21 | elif dataset_name == "humaneval": 22 | return HumanDataset 23 | elif dataset_name == "human": 24 | return HumanDataset 25 | elif dataset_name == "cc": 26 | return CodeContestDataset 27 | else: 28 | raise Exception(f"Unknown dataset name {dataset_name}") 29 | -------------------------------------------------------------------------------- /src/datasets/HumanEvalDataset.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .Dataset import Dataset 3 | from evaluations.func_evaluate import evaluate_functional_correctness, evaluate_io 4 | from constants.paths import * 5 | 6 | 7 | class HumanDataset(Dataset): 8 | def __init__( 9 | self, 10 | path: str = HUMAN_DATA_PATH, 11 | ): 12 | super().__init__(path) 13 | self.id_key = "task_id" 14 | 15 | def evaluate( 16 | self, 17 | item: dict, 18 | cur_imp: str, 19 | language: str, 20 | ): 21 | result = evaluate_functional_correctness( 22 | test=item["test"], 23 | entry_point=item["entry_point"], 24 | completion=cur_imp, 25 | ) 26 | return result == "passed" 27 | 28 | def evaluate_sample_io( 29 | self, 30 | item: dict, 31 | cur_imp: str, 32 | language: str, 33 | ): 34 | 35 | return evaluate_io( 36 | sample_io=item["sample_io"], 37 | completion=cur_imp, 38 | ) 39 | 40 | 41 | def evaluate_additional_io( 42 | self, 43 | id: int, 44 | io: List[str], 45 | cur_imp: str, 46 | language: str, 47 | ): 48 | if len(io) == 0: 49 | return True, "" 50 | 51 | return evaluate_io( 52 | sample_io=io, 53 | completion=cur_imp, 54 | ) 55 | 56 | @staticmethod 57 | def get_prompt(item): 58 | if "prompt" in item: 59 | return f"{item['prompt'].strip()}" 60 | elif "text" in item: 61 | return f"{item['text'].strip()}" 62 | else: 63 | raise Exception("No prompt or text in item") 64 | -------------------------------------------------------------------------------- 
/src/datasets/MBPPDataset.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .Dataset import Dataset 3 | from evaluations.func_evaluate import evaluate_io, evaluate_functional_correctness 4 | from constants.paths import * 5 | 6 | 7 | class MBPPDataset(Dataset): 8 | def __init__( 9 | self, 10 | path: str = MBPP_DATA_PATH, 11 | ): 12 | super().__init__(path) 13 | self.id_key = "name" 14 | 15 | def evaluate( 16 | self, 17 | item: dict, 18 | cur_imp: str, 19 | language: str, 20 | ): 21 | # result, _ = evaluate_io(item['test_list'],cur_imp,5,True) 22 | # return result 23 | result = evaluate_functional_correctness( 24 | test=item["test"], 25 | entry_point=item["entry_point"], 26 | completion=cur_imp, 27 | ) 28 | return result == "passed" 29 | 30 | def evaluate_sample_io( 31 | self, 32 | item: dict, 33 | cur_imp: str, 34 | language: str, 35 | ): 36 | if "sample_io" not in item: 37 | return True, "" 38 | if len(item["sample_io"]) == 0: 39 | return True, "" 40 | return evaluate_io( 41 | sample_io=item["sample_io"], 42 | completion=cur_imp, 43 | ) 44 | 45 | def evaluate_additional_io( 46 | self, 47 | id: int, 48 | io: List[str], 49 | cur_imp: str, 50 | language: str, 51 | ): 52 | if len(io) == 0: 53 | return True, "" 54 | 55 | return evaluate_io( 56 | sample_io=io, 57 | completion=cur_imp, 58 | ) 59 | 60 | 61 | @staticmethod 62 | def get_prompt(item): 63 | # function_signature = item['code'].split('\n')[0].strip() 64 | # return f"{item['text']}\nFunction Signature: {function_signature}" 65 | return item["prompt"].strip() 66 | -------------------------------------------------------------------------------- /src/datasets/XCodeDataset.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .Dataset import Dataset 3 | from evaluations.evalute import xcode_evaluate, contest_evaluate_public_tests 4 | from constants.paths import * 5 | 6 | 7 | class XCodeDataset(Dataset): 8 | def __init__( 9 | self, 10 | path: str = XCODE_VALIDATION_DATA_PATH, 11 | ): 12 | super().__init__(path) 13 | self.id_key = "src_uid" 14 | 15 | def evaluate_sample_io( 16 | self, 17 | item: dict, 18 | cur_imp: str, 19 | language: str, 20 | ): 21 | sample_io = [] 22 | 23 | for input, output in zip(item["sample_inputs"], item["sample_outputs"]): 24 | sample_io.append({ 25 | "input": input, 26 | "output": [output] 27 | }) 28 | 29 | return contest_evaluate_public_tests( 30 | generated_code=cur_imp, 31 | id=item[self.id_key], 32 | tests=sample_io, 33 | lang=language 34 | ) 35 | 36 | 37 | def evaluate( 38 | self, 39 | item: dict, 40 | cur_imp: str, 41 | language: str, 42 | ): 43 | return xcode_evaluate( 44 | generated_code=cur_imp, 45 | src_uid=item["src_uid"], 46 | lang=language 47 | ) 48 | 49 | def evaluate_additional_io( 50 | self, 51 | id: int, 52 | tests: List[str], 53 | cur_imp: str, 54 | language: str, 55 | ): 56 | 57 | if tests == []: 58 | return True, '' 59 | 60 | return contest_evaluate_public_tests( 61 | generated_code=cur_imp, 62 | id=id, 63 | tests=tests, 64 | lang=language 65 | ) 66 | 67 | @staticmethod 68 | def get_prompt(item): 69 | return f"Problem Description:\n{item['description']}\nInput Specification:\n{item['input_spec']}\nOutput Specification:\n{item['output_spec']}\nSample Inputs: {item['sample_inputs']}\nSample Outputs: {item['sample_outputs']}\n\n-------\nImportant Note: If you are writing a function then after the function definition take input from using `input()` function, call 
the function with specified parameters and finally print the output of the function.\nNote: {item['notes']}\nTake input from: {item['input_from']}\nGive output to: {item['output_to']}" 70 | -------------------------------------------------------------------------------- /src/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/datasets/__init__.py -------------------------------------------------------------------------------- /src/datasets/convert-apps-xcode.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | def read_jsonl(filename): 6 | """Reads a jsonl file and yields each line as a dictionary""" 7 | lines = [] 8 | # i = 0 9 | with open(filename, "r", encoding="utf-8") as file: 10 | for line in file: 11 | lines.append(json.loads(line)) 12 | # i += 1 13 | # print(i) 14 | return lines 15 | 16 | # Write a python list of dictionaries into a jsonl file 17 | 18 | 19 | def write_jsonl(filename, lines): 20 | """Writes a python list of dictionaries into a jsonl file""" 21 | with open(filename, "w", encoding="utf-8") as file: 22 | for line in lines: 23 | file.write(json.dumps(line) + "\n") 24 | 25 | 26 | train_set = read_jsonl("./data/APPS/train.jsonl") 27 | test_set = read_jsonl("./data/APPS/train.jsonl") 28 | 29 | dataset = train_set + test_set 30 | 31 | print(len(dataset)) 32 | 33 | dataset = pd.DataFrame(dataset) 34 | # dataset.columns 35 | 36 | print(dataset['difficulty'].unique()) 37 | 38 | 39 | # Filter problems from codeforces with atleast 10 input and output 40 | filter_indices = [False] * len(dataset) 41 | for i in range(len(dataset)): 42 | row = dataset.iloc[i] 43 | if "codeforces" in row['url'] and row['input_output'] and len(json.loads(row['input_output'])["inputs"]) > 5: 44 | filter_indices[i] = True 45 | 46 | codeforces_dataset = dataset[filter_indices] 47 | 48 | print(len(codeforces_dataset)) 49 | 50 | # Randomly choose 50 problems 51 | codeforces_dataset_50 = codeforces_dataset.sample(n=min(50, len(codeforces_dataset)), random_state=1, replace=False) 52 | print(len(codeforces_dataset_50)) 53 | 54 | codeforces_dataset_50.reset_index(drop=True, inplace=True) 55 | 56 | # Filter interview problems with atleast 10 input and output 57 | filter_indices = [False] * len(dataset) 58 | for i in range(len(dataset)): 59 | row = dataset.iloc[i] 60 | if "interview" == row['difficulty'] and row['input_output'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5: 61 | filter_indices[i] = True 62 | 63 | interview_dataset = dataset[filter_indices] 64 | 65 | print(len(interview_dataset)) 66 | 67 | # Randomly choose 50 problems 68 | interview_dataset_50 = interview_dataset.sample( 69 | n=min(50, len(interview_dataset)), random_state=1, replace=False) 70 | print(len(interview_dataset_50)) 71 | 72 | interview_dataset_50.reset_index(drop=True, inplace=True) 73 | 74 | 75 | # Filter introductory problems with atleast 10 input and output 76 | filter_indices = [False] * len(dataset) 77 | for i in range(len(dataset)): 78 | row = dataset.iloc[i] 79 | if "introductory" == row['difficulty'] and len(row['input_output']) < 2000 and len(json.loads(row['input_output'])["inputs"]) > 5: 80 | filter_indices[i] = True 81 | 82 | introductory_dataset = dataset[filter_indices] 83 | 84 | print(len(introductory_dataset)) 85 | 86 | # Randomly choose 50 
problems 87 | introductory_dataset_50 = introductory_dataset.sample( 88 | n=min(50, len(introductory_dataset)), random_state=1, replace=False) 89 | print(len(introductory_dataset_50)) 90 | 91 | introductory_dataset_50.reset_index(drop=True, inplace=True) 92 | 93 | selected_df = pd.concat([introductory_dataset_50, interview_dataset_50, codeforces_dataset_50], ignore_index=True) 94 | 95 | 96 | def get_test_cases(input, output): 97 | return { 98 | "input": "\n".join([str(x) for x in input]) if type(input) == list else input, 99 | "output": output if type(output) == list else [output] 100 | } 101 | 102 | 103 | selected_datasets = [] 104 | 105 | for i in range(len(selected_df)): 106 | row = selected_df.iloc[i] 107 | test_cases = json.loads(row['input_output']) 108 | 109 | public_test_cases = list( 110 | map(get_test_cases, test_cases['inputs'][0:2], test_cases['outputs'][0:2])) 111 | test_cases = list( 112 | map(get_test_cases, test_cases['inputs'], test_cases['outputs'])) 113 | 114 | test = { 115 | "name": str(row['id']), 116 | "description": str(row['question']), 117 | "difficulty": str(row['difficulty']), 118 | "id": int(row['id']), 119 | "sample_io": public_test_cases, 120 | "test_list": test_cases, 121 | "starter_code": str(row['starter_code']), 122 | } 123 | 124 | selected_datasets.append(test) 125 | 126 | 127 | write_jsonl("./data/APPS/selected150.jsonl", selected_datasets) 128 | 129 | 130 | -------------------------------------------------------------------------------- /src/datasets/convert-cc-xcode.py: -------------------------------------------------------------------------------- 1 | # Using this python file we have converted the code contest dataset to the format of the xCodeEval dataset. 2 | 3 | import pandas as pd 4 | import json 5 | 6 | 7 | def read_jsonl(filename): 8 | """Reads a jsonl file and yields each line as a dictionary""" 9 | lines = [] 10 | # i = 0 11 | with open(filename, "r", encoding="utf-8") as file: 12 | for line in file: 13 | lines.append(json.loads(line)) 14 | # i += 1 15 | # print(i) 16 | return lines 17 | 18 | # Write a python list of dictionaries into a jsonl file 19 | 20 | 21 | def write_jsonl(filename, lines): 22 | """Writes a python list of dictionaries into a jsonl file""" 23 | with open(filename, "w", encoding="utf-8") as file: 24 | for line in lines: 25 | file.write(json.dumps(line) + "\n") 26 | 27 | 28 | df = pd.read_parquet("./data/CodeContest/validation.parquet", engine='pyarrow') 29 | df = df[['name', 'cf_contest_id', 'cf_tags', 'difficulty', 30 | 'description', 'public_tests', 'private_tests', 'generated_tests']] 31 | 32 | 33 | def get_test_cases(input, output): 34 | return { 35 | "input": str(input), 36 | "output": [str(output)] 37 | } 38 | 39 | 40 | test_datasets = [] 41 | 42 | for i in range(len(df)): 43 | row = df.iloc[i] 44 | 45 | public_test_cases = list( 46 | map(get_test_cases, row['public_tests']['input'], row['public_tests']['output'])) 47 | test_cases = [] 48 | test_cases.extend(list(map( 49 | get_test_cases, row['private_tests']['input'], row['private_tests']['output']))) 50 | test_cases.extend(list(map( 51 | get_test_cases, row['generated_tests']['input'], row['generated_tests']['output']))) 52 | 53 | test = { 54 | "name": str(row['name']), 55 | "description": str(row['description']), 56 | "tags": list(row['cf_tags']), 57 | "difficulty": int(row['difficulty']), 58 | "id": int(row['cf_contest_id']), 59 | "sample_io": public_test_cases, 60 | "test_list": test_cases 61 | } 62 | 63 | test_datasets.append(test) 64 | 65 | 66 | 
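# Persist the converted CodeContest validation split as jsonl, using the same xCodeEval-style
# record layout (description, sample_io, test_list, ...) that the loaders in src/datasets expect.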
write_jsonl("./data/CodeContest/Val.jsonl", test_datasets) 67 | -------------------------------------------------------------------------------- /src/evaluations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/evaluations/__init__.py -------------------------------------------------------------------------------- /src/evaluations/api_comm.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | import requests 4 | from .exec_outcome import ExecOutcome 5 | 6 | @dataclass 7 | class ExtendedUnittest: 8 | input: str 9 | output: list[str] = field(default_factory=list) 10 | result: str | None = None 11 | exec_outcome: ExecOutcome | None = None 12 | 13 | def json(self): 14 | _json = self.__dict__ 15 | if self.exec_outcome is not None: 16 | _json["exec_outcome"] = self.exec_outcome.name 17 | 18 | return _json 19 | 20 | @classmethod 21 | def from_json(cls, _json): 22 | return cls( 23 | input=_json.get("input", ""), 24 | output=_json.get("output", list()), 25 | result=_json.get("result", None), 26 | exec_outcome=_json.get("exec_outcome", None), 27 | ) 28 | 29 | 30 | class EmptyValueError(Exception): 31 | def __init__(self, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | 34 | 35 | class EmptyUnittestError(EmptyValueError): 36 | pass 37 | 38 | 39 | class EmptyLanguageError(EmptyValueError): 40 | pass 41 | 42 | 43 | class EmptySourceCodeError(EmptyValueError): 44 | pass 45 | 46 | 47 | class APICommunication: 48 | _session: requests.Session 49 | 50 | def __init__(self, server_url: str = "http://localhost:5000"): 51 | self._session = requests.Session() 52 | self.execute_code_url = f"{server_url}/api/execute_code" 53 | self.get_runtimes_url = f"{server_url}/api/all_runtimes" 54 | 55 | def __enter__(self): 56 | return self 57 | 58 | def __exit__(self, *args): 59 | self._session.close() 60 | 61 | def get_runtimes(self): 62 | return self._session.get(self.get_runtimes_url).json() 63 | 64 | def execute_code( 65 | self, 66 | language: str, 67 | source_code: str, 68 | unittests: list[dict], 69 | limits: dict | None, 70 | block_network: bool = True, 71 | stop_on_first_fail: bool = True, 72 | use_sanitizer: bool = False, 73 | compiler_program_name: str | None = None, 74 | compiler_flags: str | None = None, 75 | interpreter_cmd: str | None = None, 76 | interpreter_flags: str | None = None, 77 | sample_id: int | None = None, 78 | task_id: str | int | None = None, 79 | ) -> tuple[list[ExtendedUnittest], int | None, str | int | None]: 80 | if language is None: 81 | raise EmptyLanguageError 82 | 83 | if source_code is None: 84 | raise EmptySourceCodeError 85 | 86 | if unittests is None or len(unittests) == 0: 87 | raise EmptyUnittestError 88 | 89 | request_body = dict( 90 | language=language, 91 | source_code=source_code, 92 | unittests=unittests, 93 | limits=limits if isinstance(limits, dict) else dict(), 94 | compile_cmd=compiler_program_name, 95 | compile_flags=compiler_flags, 96 | execute_cmd=interpreter_cmd, 97 | execute_flags=interpreter_flags, 98 | block_network=block_network, 99 | stop_on_first_fail=stop_on_first_fail, 100 | use_sanitizer=use_sanitizer, 101 | ) 102 | json_response = self._session.post( 103 | self.execute_code_url, 104 | json=request_body, 105 | headers={"Content-Type": "application/json"}, 106 | ).json() 107 | 108 | if "error" in json_response: 109 | return 
"error", json_response["error"], task_id 110 | if "data" not in json_response: 111 | return "error", str(json_response), task_id 112 | 113 | return ( 114 | json_response["data"], 115 | None, 116 | task_id, 117 | ) 118 | -------------------------------------------------------------------------------- /src/evaluations/evalute.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | import tqdm 5 | from yaml import safe_load 6 | from typing import List 7 | 8 | from .api_comm import APICommunication 9 | from .exec_outcome import ExecOutcome 10 | from constants.lang_mappings import LANGUAGE_MAPPING 11 | 12 | limits_by_lang_cfg_file = "./src/evaluations/limits_by_lang.yaml" 13 | 14 | assert os.path.exists( 15 | limits_by_lang_cfg_file), "Need resource limit defaults for all runtimes, provide the path to default 'limits_by_lang.yaml' or to the modified one." 16 | 17 | with open(limits_by_lang_cfg_file) as limit_cfg_rp: 18 | limits_by_lang = safe_load(limit_cfg_rp) 19 | 20 | unittest_file = "./data/xCodeEval/unittest_db.json" 21 | assert os.path.exists(unittest_file), "Unittest file not found." 22 | 23 | with open(unittest_file) as ut_rp: 24 | unittest_db = json.load(ut_rp) 25 | 26 | 27 | api_comm = APICommunication() 28 | 29 | 30 | def xcode_evaluate( 31 | generated_code: str, 32 | src_uid: str, 33 | lang: str 34 | ): 35 | 36 | assert src_uid in unittest_db, "Can not find the task id or source id" 37 | 38 | assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}" 39 | 40 | results, _, _ = api_comm.execute_code( 41 | language=LANGUAGE_MAPPING[lang], 42 | source_code=generated_code, 43 | unittests=unittest_db[src_uid], 44 | limits=limits_by_lang[LANGUAGE_MAPPING[lang]], 45 | task_id=src_uid, 46 | ) 47 | 48 | if results == "error": 49 | return False 50 | 51 | passed = True 52 | for result in results: 53 | if result['exec_outcome'] != ExecOutcome.PASSED.value: 54 | passed = False 55 | break 56 | 57 | return passed 58 | 59 | 60 | def xcode_execute_internal_test( 61 | generated_code: str, 62 | tests: List[dict], 63 | src_uid: str, 64 | lang: str 65 | ): 66 | results, _, _ = api_comm.execute_code( 67 | language=LANGUAGE_MAPPING[lang], 68 | source_code=generated_code, 69 | unittests=tests, 70 | limits=limits_by_lang[LANGUAGE_MAPPING[lang]], 71 | task_id=src_uid, 72 | stop_on_first_fail=False 73 | ) 74 | 75 | passed = True 76 | passed_feedback = [] 77 | failed_feedback = [] 78 | 79 | idx = 0 80 | try: 81 | for idx, result in enumerate(results): 82 | if result['exec_outcome'] == ExecOutcome.PASSED.value: 83 | passed_feedback.append(tests[idx]) 84 | if result['exec_outcome'] != ExecOutcome.PASSED.value: 85 | failed_feedback.append(tests[idx]) 86 | passed = False 87 | except: 88 | passed = False 89 | failed_feedback.extend(tests[idx:]) 90 | 91 | feedback = f'Tested passed: \n{json.dumps(passed_feedback)}\n\nTests failed: \n{json.dumps(failed_feedback)}' 92 | 93 | return passed, feedback 94 | 95 | 96 | def contest_evaluate( 97 | generated_code: str, 98 | lang: str, 99 | id: int, 100 | tests: List[dict], 101 | ): 102 | assert lang in LANGUAGE_MAPPING, f"language must be inside the supported language list: {LANGUAGE_MAPPING.keys()}" 103 | 104 | results, _, _ = api_comm.execute_code( 105 | language=LANGUAGE_MAPPING[lang], 106 | source_code=generated_code, 107 | unittests=tests, 108 | limits=limits_by_lang[LANGUAGE_MAPPING[lang]], 109 | task_id=id, 110 | ) 111 | 112 | if 
results == "error": 113 | return False 114 | 115 | passed = True 116 | for result in results: 117 | if result['exec_outcome'] != ExecOutcome.PASSED.value: 118 | passed = False 119 | break 120 | 121 | return passed 122 | 123 | 124 | def contest_evaluate_public_tests( 125 | generated_code: str, 126 | lang: str, 127 | id: int, 128 | tests: List[dict], 129 | ): 130 | results, error, _ = api_comm.execute_code( 131 | language=LANGUAGE_MAPPING[lang], 132 | source_code=generated_code, 133 | unittests=tests, 134 | limits=limits_by_lang[LANGUAGE_MAPPING[lang]], 135 | task_id=id, 136 | stop_on_first_fail=False 137 | ) 138 | 139 | if error is not None: 140 | return False, f"## Tests failed:\nSyntax Error Message:{error}" 141 | 142 | passed = True 143 | passed_feedback = [] 144 | failed_feedback = [] 145 | 146 | idx = 0 147 | try: 148 | for idx, result in enumerate(results): 149 | output = str(result['result']) 150 | if len(output) > 500: 151 | output = output[:500] + "..." 152 | test_case = f"Input:\n{tests[idx]['input']}\nExpected Output:\n{tests[idx]['output'][0]}\nYour Output:\n{output}\n" 153 | if result['exec_outcome'] == ExecOutcome.PASSED.value: 154 | passed_feedback.append(test_case) 155 | if result['exec_outcome'] != ExecOutcome.PASSED.value: 156 | failed_feedback.append(test_case) 157 | passed = False 158 | except: 159 | passed = False 160 | test_cases = [] 161 | for i in range(idx, len(tests)): 162 | test_case = f"Input:\n{tests[i]['input']}\nExpected Output:\n{tests[i]['output'][0]}\n" 163 | test_cases.append(test_case) 164 | 165 | failed_feedback.extend(test_cases) 166 | 167 | passed_feedback = '\n'.join(passed_feedback) if len(passed_feedback) > 0 else "No test cases passed." 168 | failed_feedback = '\n'.join(failed_feedback) 169 | feedback = f'## Tested passed:\n{passed_feedback}\n\n## Tests failed:\n{failed_feedback}' 170 | 171 | return passed, feedback 172 | -------------------------------------------------------------------------------- /src/evaluations/exec_outcome.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ExecOutcome(Enum): 5 | PASSED = "PASSED" # code executes and output matches expected output 6 | WRONG_ANSWER = ( 7 | "WRONG_ANSWER" # code executes and output does NOT matches expected output 8 | ) 9 | TIME_LIMIT_EXCEEDED = "TIME_LIMIT_EXCEEDED" # code executes and didn't exit in time, output is ignored in this case 10 | RUNTIME_ERROR = "RUNTIME_ERROR" # code failed to execute (crashed) 11 | COMPILATION_ERROR = "COMPILATION_ERROR" # code failed to compile 12 | MEMORY_LIMIT_EXCEEDED = ( 13 | "MEMORY_LIMIT_EXCEEDED" # code exceeded memory limit during execution 14 | ) 15 | -------------------------------------------------------------------------------- /src/evaluations/executor_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def timeout_handler(_, __): 3 | raise TimeoutError() 4 | 5 | import os, json 6 | def to_jsonl(dict_data, file_path): 7 | with open(file_path, 'a') as file: 8 | json_line = json.dumps(dict_data) 9 | file.write(json_line + os.linesep) 10 | 11 | from threading import Thread 12 | class PropagatingThread(Thread): 13 | def run(self): 14 | self.exc = None 15 | try: 16 | if hasattr(self, '_Thread__target'): 17 | # Thread uses name mangling prior to Python 3. 
18 | self.ret = self._Thread__target(*self._Thread__args, **self._Thread__kwargs) 19 | else: 20 | self.ret = self._target(*self._args, **self._kwargs) 21 | except BaseException as e: 22 | self.exc = e 23 | 24 | def join(self, timeout=None): 25 | super(PropagatingThread, self).join(timeout) 26 | if self.exc: 27 | raise self.exc 28 | return self.ret 29 | 30 | 31 | def function_with_timeout(func, args, timeout): 32 | result_container = [] 33 | 34 | def wrapper(): 35 | result_container.append(func(*args)) 36 | 37 | thread = PropagatingThread(target=wrapper) 38 | thread.start() 39 | thread.join(timeout) 40 | 41 | if thread.is_alive(): 42 | raise TimeoutError() 43 | else: 44 | return result_container[0] 45 | 46 | # Py tests 47 | 48 | # if __name__ == "__main__": 49 | # formatter = PySubmissionFormatter() 50 | # leetcode_1 = 'class Solution:\n def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n ' 51 | # humaneval_1 = 'def solveSudoku(self, board: List[List[str]]) -> None:\n """\n Do not return anything, modify board in-place instead.\n """\n' 52 | 53 | # assert leetcode_1 == formatter.to_leetcode(humaneval_1) 54 | # assert humaneval_1 == formatter.to_humaneval(leetcode_1) 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/evaluations/func_evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | import contextlib 3 | import signal 4 | 5 | from .executor_utils import function_with_timeout 6 | 7 | 8 | def evaluate_io( 9 | sample_io: list[str], 10 | completion: str, 11 | timeout: int = 5, 12 | stop_early: bool = False, 13 | ): 14 | if len(sample_io) == 0: 15 | return True, "" 16 | 17 | test_log = "" 18 | passed = True 19 | for io in sample_io: 20 | try: 21 | code = ("from typing import *\n" if "from typing import *" not in completion else "") + \ 22 | completion + "\n" + io + "\n" 23 | function_with_timeout( 24 | exec, 25 | (code, globals()), 26 | timeout 27 | ) 28 | test_log += f"Passed in test case: {io}\n" 29 | except Exception as e: 30 | if stop_early: 31 | return False, f"Failed in test case: {io}\n" 32 | passed = False 33 | test_log += f"Failed in test case: {io}\n" 34 | 35 | return passed, test_log 36 | 37 | 38 | def evaluate_io_et( 39 | sample_io: list[str], 40 | completion: str, 41 | timeout: int = 5, 42 | prompt: str = "", 43 | ): 44 | io = "\n".join(sample_io) 45 | try: 46 | code = ("from typing import *\n" if "from typing import *" not in completion else "") + \ 47 | prompt + completion + "\n" + io + "\n" 48 | function_with_timeout( 49 | exec, 50 | (code, globals()), 51 | timeout 52 | ) 53 | return True 54 | except Exception as e: 55 | return False 56 | 57 | 58 | def evaluate_functional_correctness( 59 | test: str, 60 | entry_point: str, 61 | completion: str, 62 | timeout: int = 5, 63 | ): 64 | try: 65 | code = ("from typing import *\n" if "from typing import *" not in completion else "") + \ 66 | completion + "\n" + test + \ 67 | "\n" + f"check({entry_point})" 68 | 69 | function_with_timeout( 70 | exec, 71 | (code, globals()), 72 | timeout 73 | ) 74 | return "passed" 75 | except Exception as e: 76 | return f"failed: {e}" 77 | 78 | 79 | class TimeoutException(Exception): 80 | pass 81 | -------------------------------------------------------------------------------- /src/evaluations/limits_by_lang.yaml: -------------------------------------------------------------------------------- 1 | GNU C: 
2 | nofile: 0 3 | 4 | GNU C11: 5 | nofile: 0 6 | 7 | GNU C++: 8 | nofile: 0 9 | 10 | GNU C++0x: 11 | nofile: 0 12 | 13 | GNU C++11: 14 | nofile: 0 15 | 16 | GNU C++14: 17 | nofile: 0 18 | 19 | GNU C++17: 20 | nofile: 0 21 | 22 | GNU C++17 (64): 23 | nofile: 0 24 | 25 | GNU C++20 (64): 26 | nofile: 0 27 | 28 | GNU C++20: 29 | nofile: 0 30 | 31 | GNU C++17 Diagnostics: 32 | nofile: 0 33 | 34 | Clang++17 Diagnostics: 35 | nofile: 0 36 | 37 | Clang++17: 38 | nofile: 0 39 | 40 | Clang++20 Diagnostics: 41 | nofile: 0 42 | 43 | Clang++20: 44 | nofile: 0 45 | 46 | Clang++14: 47 | nofile: 0 48 | 49 | Clang++11: 50 | nofile: 0 51 | 52 | MS C++: 53 | nofile: 0 54 | 55 | MS C++ 2017: 56 | nofile: 0 57 | 58 | MS C#: 59 | nofile: 4 60 | nproc: 4 61 | fsize: 1073741824 62 | 63 | C# 10: 64 | nofile: 4 65 | nproc: 4 66 | fsize: 1073741824 67 | 68 | C# 8: 69 | nofile: 4 70 | nproc: 4 71 | fsize: 1073741824 72 | 73 | Mono C#: 74 | nofile: 4 75 | nproc: 4 76 | fsize: 1073741824 77 | 78 | .NET Core C#: 79 | nofile: 4 80 | nproc: 4 81 | fsize: 1073741824 82 | 83 | PyPy 2: 84 | nofile: 4 85 | 86 | Python 2: 87 | nofile: 4 88 | 89 | PyPy 3: 90 | nofile: 4 91 | 92 | PyPy 3-64: 93 | nofile: 4 94 | 95 | Python 3: 96 | nofile: 4 97 | 98 | Python 3 + libs: 99 | nofile: 4 100 | 101 | JavaScript: 102 | nofile: 4 103 | 104 | Node js: 105 | nofile: 4 106 | 107 | Node.js: 108 | nofile: 4 109 | 110 | Rust: 111 | nofile: 4 112 | 113 | Rust 2021: 114 | nofile: 4 115 | 116 | Rust 2018: 117 | nofile: 4 118 | 119 | Rust 2015: 120 | nofile: 4 121 | 122 | Java 6: 123 | nofile: 10 124 | _as: -1 125 | 126 | Java 7: 127 | nofile: 10 128 | _as: -1 129 | 130 | Java 1.5: 131 | nofile: 10 132 | _as: -1 133 | 134 | Java 8: 135 | nofile: 10 136 | _as: -1 137 | 138 | Java 11: 139 | nofile: 10 140 | _as: -1 141 | 142 | Java 17: 143 | nofile: 10 144 | _as: -1 145 | 146 | PHP: 147 | nofile: 4 148 | 149 | PHP 8.1: 150 | nofile: 4 151 | 152 | Go: 153 | nofile: 0 154 | nproc: 6 155 | 156 | Ruby: 157 | nofile: 10 158 | 159 | Ruby 3: 160 | nofile: 10 161 | 162 | Kotlin: 163 | nofile: 7 164 | nproc: 23 165 | _as: -1 166 | 167 | Kotlin 1.4: 168 | nofile: 7 169 | nproc: 23 170 | _as: -1 171 | 172 | Kotlin 1.5: 173 | nofile: 7 174 | nproc: 23 175 | _as: -1 176 | 177 | Kotlin 1.6: 178 | nofile: 7 179 | nproc: 23 180 | _as: -1 181 | 182 | Kotlin 1.7: 183 | nofile: 7 184 | nproc: 23 185 | _as: -1 186 | -------------------------------------------------------------------------------- /src/evaluations/resource_limit.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, fields 2 | 3 | 4 | @dataclass(kw_only=True) 5 | class ResourceLimits: 6 | core: int = 0 # RLIMIT_CORE 7 | data: int = -1 # RLIMIT_DATA 8 | # nice: int = 20 # RLIMIT_NICE 9 | fsize: int = 0 # RLIMIT_FSIZE 10 | sigpending: int = 0 # RLIMIT_SIGPENDING 11 | # memlock: int = -1 # RLIMIT_MEMLOCK 12 | rss: int = -1 # RLIMIT_RSS 13 | nofile: int = 4 # RLIMIT_NOFILE 14 | msgqueue: int = 0 # RLIMIT_MSGQUEUE 15 | rtprio: int = 0 # RLIMIT_RTPRIO 16 | stack: int = -1 # RLIMIT_STACK 17 | cpu: int = 2 # RLIMIT_CPU, CPU time, in seconds. 18 | nproc: int = 1 # RLIMIT_NPROC 19 | _as: int = 2 * 1024 ** 3 # RLIMIT_AS set to 2GB by default 20 | locks: int = 0 # RLIMIT_LOCKS 21 | # rttime: int = 2 # RLIMIT_RTTIME, Timeout for real-time tasks. 
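    # Each attribute mirrors an RLIMIT_* resource; the __main__ block below shows how they are
    # rendered as prlimit-style flags (a leading underscore, as in `_as`, is stripped from the flag name).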
22 | 23 | def fields(self): 24 | for field in fields(self): 25 | yield field.name 26 | 27 | 28 | if __name__ == "__main__": 29 | limits = ResourceLimits() 30 | prlimit_str = " ".join( 31 | f"--{field.name[1:] if field.name.startswith('_') else field.name}={getattr(limits, field.name)}" 32 | for field in fields(limits) 33 | ) 34 | print(prlimit_str) 35 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import dotenv 2 | dotenv.load_dotenv() 3 | 4 | import argparse 5 | import sys 6 | from datetime import datetime 7 | from constants.paths import * 8 | 9 | from models.Gemini import Gemini 10 | from models.OpenAI import OpenAIModel 11 | 12 | from results.Results import Results 13 | 14 | from promptings.PromptingFactory import PromptingFactory 15 | from datasets.DatasetFactory import DatasetFactory 16 | from models.ModelFactory import ModelFactory 17 | 18 | from constants.verboseType import * 19 | 20 | from utils.summary import gen_summary 21 | from utils.runEP import run_eval_plus 22 | from utils.evaluateET import generate_et_dataset_human 23 | from utils.evaluateET import generate_et_dataset_mbpp 24 | from utils.generateEP import generate_ep_dataset_human 25 | from utils.generateEP import generate_ep_dataset_mbpp 26 | 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument( 30 | "--dataset", 31 | type=str, 32 | default="HumanEval", 33 | choices=[ 34 | "HumanEval", 35 | "MBPP", 36 | "APPS", 37 | "xCodeEval", 38 | "CC", 39 | ] 40 | ) 41 | parser.add_argument( 42 | "--strategy", 43 | type=str, 44 | default="Direct", 45 | choices=[ 46 | "Direct", 47 | "CoT", 48 | "SelfPlanning", 49 | "Analogical", 50 | "MapCoder", 51 | "CodeSIM", 52 | "CodeSIMWD", 53 | "CodeSIMWPV", 54 | "CodeSIMWPVD", 55 | "CodeSIMA", 56 | "CodeSIMC", 57 | ] 58 | ) 59 | parser.add_argument( 60 | "--model", 61 | type=str, 62 | default="ChatGPT", 63 | ) 64 | parser.add_argument( 65 | "--model_provider", 66 | type=str, 67 | default="OpenAI", 68 | ) 69 | parser.add_argument( 70 | "--temperature", 71 | type=float, 72 | default=0 73 | ) 74 | parser.add_argument( 75 | "--top_p", 76 | type=float, 77 | default=0.95 78 | ) 79 | parser.add_argument( 80 | "--pass_at_k", 81 | type=int, 82 | default=1 83 | ) 84 | parser.add_argument( 85 | "--language", 86 | type=str, 87 | default="Python3", 88 | choices=[ 89 | "C", 90 | "C#", 91 | "C++", 92 | "Go", 93 | "PHP", 94 | "Python3", 95 | "Ruby", 96 | "Rust", 97 | ] 98 | ) 99 | 100 | parser.add_argument( 101 | "--cont", 102 | type=str, 103 | default="yes", 104 | choices=[ 105 | "yes", 106 | "no" 107 | ] 108 | ) 109 | 110 | parser.add_argument( 111 | "--result_log", 112 | type=str, 113 | default="partial", 114 | choices=[ 115 | "full", 116 | "partial" 117 | ] 118 | ) 119 | 120 | parser.add_argument( 121 | "--verbose", 122 | type=str, 123 | default="2", 124 | choices=[ 125 | "2", 126 | "1", 127 | "0", 128 | ] 129 | ) 130 | 131 | parser.add_argument( 132 | "--store_log_in_file", 133 | type=str, 134 | default="yes", 135 | choices=[ 136 | "yes", 137 | "no", 138 | ] 139 | ) 140 | 141 | args = parser.parse_args() 142 | 143 | DATASET = args.dataset 144 | STRATEGY = args.strategy 145 | MODEL_NAME = args.model 146 | MODEL_PROVIDER_NAME = args.model_provider 147 | TEMPERATURE = args.temperature 148 | TOP_P = args.top_p 149 | PASS_AT_K = args.pass_at_k 150 | LANGUAGE = args.language 151 | CONTINUE = args.cont 152 | RESULT_LOG_MODE = args.result_log 153 | VERBOSE = 
int(args.verbose) 154 | STORE_LOG_IN_FILE = args.store_log_in_file 155 | 156 | MODEL_NAME_FOR_RUN = MODEL_NAME 157 | 158 | RUN_NAME = f"results/{DATASET}/{STRATEGY}/{MODEL_NAME_FOR_RUN}/{LANGUAGE}-{TEMPERATURE}-{TOP_P}-{PASS_AT_K}" 159 | 160 | run_no = 1 161 | while os.path.exists(f"{RUN_NAME}/Run-{run_no}"): 162 | run_no += 1 163 | 164 | if CONTINUE == "yes" and run_no > 1: 165 | run_no -= 1 166 | 167 | RUN_NAME = f"{RUN_NAME}/Run-{run_no}" 168 | 169 | if not os.path.exists(RUN_NAME): 170 | os.makedirs(RUN_NAME) 171 | 172 | RESULTS_PATH = f"{RUN_NAME}/Results.jsonl" 173 | SUMMARY_PATH = f"{RUN_NAME}/Summary.txt" 174 | LOGS_PATH = f"{RUN_NAME}/Log.txt" 175 | 176 | if STORE_LOG_IN_FILE.lower() == 'yes': 177 | sys.stdout = open( 178 | LOGS_PATH, 179 | mode="a", 180 | encoding="utf-8" 181 | ) 182 | 183 | if CONTINUE == "no" and VERBOSE >= VERBOSE_MINIMAL: 184 | print(f""" 185 | ################################################## 186 | Experiment start {RUN_NAME}, Time: {datetime.now()} 187 | ################################################### 188 | """) 189 | 190 | strategy = PromptingFactory.get_prompting_class(STRATEGY)( 191 | model=ModelFactory.get_model_class(MODEL_PROVIDER_NAME)( 192 | model_name=MODEL_NAME, 193 | temperature=TEMPERATURE, 194 | top_p=TOP_P 195 | ), 196 | data=DatasetFactory.get_dataset_class(DATASET)(), 197 | language=LANGUAGE, 198 | pass_at_k=PASS_AT_K, 199 | results=Results(RESULTS_PATH), 200 | verbose=VERBOSE 201 | ) 202 | 203 | strategy.run(RESULT_LOG_MODE.lower() == 'full') 204 | 205 | if VERBOSE >= VERBOSE_MINIMAL: 206 | print(f""" 207 | ################################################## 208 | Experiment end {RUN_NAME}, Time: {datetime.now()} 209 | ################################################### 210 | """) 211 | 212 | gen_summary(RESULTS_PATH, SUMMARY_PATH) 213 | 214 | ET_RESULTS_PATH = f"{RUN_NAME}/Results-ET.jsonl" 215 | ET_SUMMARY_PATH = f"{RUN_NAME}/Summary-ET.txt" 216 | 217 | EP_RESULTS_PATH = f"{RUN_NAME}/Results-EP.jsonl" 218 | EP_SUMMARY_PATH = f"{RUN_NAME}/Summary-EP.txt" 219 | 220 | if "human" in DATASET.lower(): 221 | generate_et_dataset_human(RESULTS_PATH, ET_RESULTS_PATH) 222 | gen_summary(ET_RESULTS_PATH, ET_SUMMARY_PATH) 223 | 224 | # generate_ep_dataset_human(RESULTS_PATH, EP_RESULTS_PATH) 225 | # run_eval_plus(EP_RESULTS_PATH, EP_SUMMARY_PATH, "humaneval") 226 | 227 | elif "mbpp" in DATASET.lower(): 228 | generate_et_dataset_mbpp(RESULTS_PATH, ET_RESULTS_PATH) 229 | gen_summary(ET_RESULTS_PATH, ET_SUMMARY_PATH) 230 | 231 | # generate_ep_dataset_human(RESULTS_PATH, EP_RESULTS_PATH) 232 | # run_eval_plus(EP_RESULTS_PATH, EP_SUMMARY_PATH, "mbpp") 233 | 234 | if STORE_LOG_IN_FILE.lower() == 'yes': 235 | sys.stdout.close() 236 | 237 | -------------------------------------------------------------------------------- /src/models/Anthropic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import base64 4 | import json 5 | 6 | from tenacity import retry, stop_after_attempt, wait_random_exponential 7 | 8 | from .Base import BaseModel 9 | 10 | import os 11 | from openai import OpenAI, AzureOpenAI 12 | import time 13 | 14 | usage_log_file_path = "anthropic_usage_log.csv" 15 | 16 | 17 | class AnthropicModel(BaseModel): 18 | def __init__( 19 | self, 20 | model_name, 21 | sleep_time=0, 22 | **kwargs 23 | ): 24 | if model_name is None: 25 | raise Exception("Model name is required") 26 | 27 | self.model_name = f"anthropic/{model_name}" 28 | 29 | self.client = OpenAI( 30 | 
base_url="https://openrouter.ai/api/v1", 31 | api_key=os.getenv("OPENROUTER_API_KEY"), 32 | ) 33 | 34 | self.temperature = kwargs.get("temperature", 0.0) 35 | self.top_p = kwargs.get("top_p", 0.95) 36 | self.max_tokens = kwargs.get("max_tokens", 8000) 37 | 38 | self.sleep_time = sleep_time 39 | 40 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5)) 41 | def prompt( 42 | self, 43 | processed_input: list[dict], 44 | frequency_penalty=0, 45 | presence_penalty=0 46 | ): 47 | time.sleep(self.sleep_time) 48 | 49 | start_time = time.perf_counter() 50 | 51 | response = self.client.chat.completions.create( 52 | model=self.model_name, 53 | messages=[ 54 | { 55 | "role": "user", 56 | "content": [ 57 | { 58 | "type": "text", 59 | "text": processed_input[0]["content"] 60 | }, 61 | ] 62 | } 63 | ], 64 | temperature=self.temperature, 65 | top_p=self.top_p, 66 | max_tokens=self.max_tokens, 67 | frequency_penalty=frequency_penalty, 68 | presence_penalty=presence_penalty, 69 | ) 70 | print(response.choices[0].message.content) 71 | 72 | end_time = time.perf_counter() 73 | 74 | with open(usage_log_file_path, mode="a") as file: 75 | file.write(f'{self.model_name},{response.usage.prompt_tokens},{response.usage.completion_tokens}\n') 76 | 77 | run_details = { 78 | "api_calls": 1, 79 | "taken_time": end_time - start_time, 80 | 81 | "prompt_tokens": response.usage.prompt_tokens, 82 | "completion_tokens": response.usage.completion_tokens, 83 | 84 | "details": [ 85 | { 86 | "model_name": self.model_name, 87 | "model_prompt": processed_input, 88 | "model_response": response.choices[0].message.content, 89 | "max_tokens": self.max_tokens, 90 | "temperature": self.temperature, 91 | "top_p": self.top_p, 92 | "frequency_penalty": frequency_penalty, 93 | "presence_penalty": presence_penalty 94 | } 95 | ], 96 | } 97 | 98 | return response.choices[0].message.content, run_details 99 | -------------------------------------------------------------------------------- /src/models/Base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | import traceback 5 | from abc import ABC, abstractmethod 6 | 7 | 8 | class BaseModel(ABC): 9 | def __init__(self, **kwargs): 10 | self.sleep_time = 0 11 | 12 | 13 | @abstractmethod 14 | # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5)) 15 | def prompt(self, processed_input, frequency_penalty=0, presence_penalty=0): 16 | pass 17 | 18 | -------------------------------------------------------------------------------- /src/models/Gemini.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | import os 3 | import google.generativeai as genai 4 | import dotenv 5 | import time 6 | from tenacity import retry, stop_after_attempt, wait_random_exponential 7 | 8 | from .Base import BaseModel 9 | 10 | api_key = os.getenv("GEMINI_API_KEY") 11 | 12 | genai.configure(api_key=api_key) 13 | 14 | class Gemini(BaseModel): 15 | def __init__(self, model_name, sleep_time=0, **kwargs): 16 | if model_name is None: 17 | raise Exception("Model name is required") 18 | 19 | self.model_name = kwargs.get("model_name", model_name) 20 | self.api_key = kwargs.get("api_key", api_key) 21 | self.temperature = kwargs.get("temperature", 0.0) 22 | self.top_p = kwargs.get("top_p", 0.95) 23 | self.max_tokens = kwargs.get("max_tokens", 8192) 24 | self.sleep_time = sleep_time 25 | 26 | genai.configure(api_key=api_key) 27 | 28 | # Create the model 29 | 
generation_config = { 30 | "temperature": self.temperature, 31 | "top_p": self.top_p, 32 | "top_k": 64, 33 | "max_output_tokens": self.max_tokens, 34 | "response_mime_type": "text/plain", 35 | } 36 | 37 | self.model = genai.GenerativeModel( 38 | model_name=model_name, 39 | generation_config=generation_config, 40 | ) 41 | 42 | 43 | @retry(wait=wait_random_exponential(min=1, max=120), stop=stop_after_attempt(20)) 44 | def prompt( 45 | self, 46 | processed_input, 47 | frequency_penalty=0, 48 | presence_penalty=0 49 | ): 50 | 51 | time.sleep(self.sleep_time) 52 | 53 | start_time = time.perf_counter() 54 | 55 | response = self.model.generate_content(processed_input[0]['content']) 56 | 57 | end_time = time.perf_counter() 58 | 59 | run_details = { 60 | "api_calls": 1, 61 | "taken_time": end_time - start_time, 62 | 63 | "prompt_tokens": response.usage_metadata.prompt_token_count, 64 | "completion_tokens": response.usage_metadata.candidates_token_count, 65 | "cost": 0, 66 | 67 | "details": [ 68 | { 69 | "model_name": self.model_name, 70 | "model_prompt": processed_input, 71 | "model_response": response.candidates[0].content.parts[0].text, 72 | "max_tokens": self.max_tokens, 73 | "temperature": self.temperature, 74 | "top_p": self.top_p, 75 | "frequency_penalty": 0, 76 | "presence_penalty": 0 77 | } 78 | ], 79 | } 80 | 81 | return response.text, run_details 82 | 83 | 84 | if __name__ == "__main__": 85 | # Load your API key from the environment variable 86 | # Create a Gemini instance (the model name below is only an example) 87 | gemini = Gemini(model_name="gemini-1.5-flash") 88 | 89 | # Sample API call 90 | processed_input = [{"content": "Tell me a joke."}] 91 | response, run_details = gemini.prompt(processed_input) 92 | 93 | print(response) 94 | print(run_details) 95 | 96 | -------------------------------------------------------------------------------- /src/models/GroqModel.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import os 3 | import requests 4 | import base64 5 | import json 6 | 7 | from tenacity import retry, stop_after_attempt, wait_random_exponential 8 | 9 | from .Base import BaseModel 10 | 11 | api_key = os.getenv("GROQ_API_KEY") 12 | 13 | import os 14 | import time 15 | from groq import Groq 16 | 17 | 18 | class GroqModel(BaseModel): 19 | def __init__(self, model_name, sleep_time=0, **kwargs): 20 | if model_name is None: 21 | raise Exception("Model name is required") 22 | 23 | self.model_name = model_name 24 | self.temperature = kwargs.get("temperature", 0.0) 25 | self.top_p = kwargs.get("top_p", 0.95) 26 | self.max_tokens = kwargs.get("max_tokens", 16000) 27 | self.sleep_time = sleep_time 28 | self.api_key = api_key 29 | 30 | self.client = Groq(api_key=self.api_key) 31 | 32 | @retry(wait=wait_random_exponential(min=600, max=3600), stop=stop_after_attempt(5)) 33 | def prompt( 34 | self, 35 | processed_input: List[Dict], 36 | frequency_penalty=0.0, 37 | presence_penalty=0.0, 38 | ) -> tuple[str, dict]: 39 | 40 | time.sleep(self.sleep_time) 41 | 42 | start_time = time.perf_counter() 43 | 44 | response = self.client.chat.completions.create( 45 | messages=processed_input, 46 | model=self.model_name, 47 | max_tokens=self.max_tokens 48 | ) 49 | 50 | end_time = time.perf_counter() 51 | 52 | run_details = { 53 | "api_calls": 1, 54 | "taken_time": end_time - start_time, 55 | 56 | "prompt_tokens": response.usage.prompt_tokens, 57 | "completion_tokens": response.usage.completion_tokens, 58 | "cost": 0, 59 | 60 | "details": [ 61 | { 62 | "model_name": self.model_name, 63 | "model_prompt": 
processed_input, 64 | "model_response": response.choices[0].message.content, 65 | "max_tokens": self.max_tokens, 66 | "temperature": self.temperature, 67 | "top_p": self.top_p, 68 | "frequency_penalty": frequency_penalty, 69 | "presence_penalty": presence_penalty 70 | } 71 | ], 72 | } 73 | 74 | return response.choices[0].message.content, run_details 75 | 76 | -------------------------------------------------------------------------------- /src/models/ModelFactory.py: -------------------------------------------------------------------------------- 1 | from models.Anthropic import * 2 | from models.Gemini import * 3 | from models.OpenAI import * 4 | from models.GroqModel import * 5 | 6 | class ModelFactory: 7 | @staticmethod 8 | def get_model_class(model_provider_name: str): 9 | model_provider_name = model_provider_name.lower() 10 | if model_provider_name == "gemini": 11 | return Gemini 12 | elif model_provider_name == "openai": 13 | return OpenAIV1Model 14 | elif model_provider_name == "openai-v2": 15 | return OpenAIV2Model 16 | elif model_provider_name == "groq": 17 | return GroqModel 18 | elif model_provider_name == "anthropic": 19 | return AnthropicModel 20 | else: 21 | raise Exception(f"Unknown model provider name {model_provider_name}") 22 | -------------------------------------------------------------------------------- /src/models/OpenAI.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import base64 4 | import json 5 | 6 | from tenacity import retry, stop_after_attempt, wait_random_exponential 7 | 8 | from .Base import BaseModel 9 | 10 | import os 11 | from openai import OpenAI, AzureOpenAI 12 | import time 13 | 14 | usage_log_file_path = "usage_log.csv" 15 | api_type = os.getenv("API_TYPE") 16 | 17 | if api_type == "openai": 18 | api_key = os.getenv("OPENAI_API_KEY") 19 | api_base = os.getenv("OPENAI_API_URL") 20 | elif api_type == "azure": 21 | api_key = os.getenv("AZURE_API_KEY") 22 | api_base = os.getenv("AZURE_API_URL") 23 | api_version = os.getenv("AZURE_API_VERSION") 24 | 25 | 26 | class OpenAIModel(BaseModel): 27 | def __init__( 28 | self, 29 | **kwargs 30 | ): 31 | pass 32 | 33 | def prompt( 34 | self, 35 | processed_input: list[dict], 36 | frequency_penalty=0, 37 | presence_penalty=0 38 | ): 39 | pass 40 | 41 | 42 | class OpenAIV1Model(OpenAIModel): 43 | def __init__(self, model_name, sleep_time=0, **kwargs): 44 | 45 | if model_name is None: 46 | raise Exception("Model name is required") 47 | 48 | if api_type == "azure": 49 | self.client = AzureOpenAI( 50 | api_key=api_key, 51 | api_version=api_version, 52 | azure_endpoint=api_base 53 | ) 54 | else: 55 | self.client = OpenAI(api_key=api_key) 56 | 57 | self.model_name = model_name 58 | 59 | self.temperature = kwargs.get("temperature", 0.0) 60 | self.top_p = kwargs.get("top_p", 0.95) 61 | self.max_tokens = kwargs.get("max_tokens", 4096) 62 | 63 | self.sleep_time = sleep_time 64 | 65 | 66 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5)) 67 | def prompt( 68 | self, 69 | processed_input: list[dict], 70 | frequency_penalty=0, 71 | presence_penalty=0 72 | ): 73 | 74 | time.sleep(self.sleep_time) 75 | 76 | start_time = time.perf_counter() 77 | 78 | if self.model_name == "o3-mini" or self.model_name == "o1": 79 | response = self.client.chat.completions.create( 80 | model=self.model_name, 81 | messages=processed_input, 82 | max_completion_tokens=self.max_tokens, 83 | stop=None, 84 | stream=False 85 | ) 86 | else: 87 | response = 
self.client.chat.completions.create( 88 | model=self.model_name, 89 | messages=processed_input, 90 | max_tokens=self.max_tokens, 91 | temperature=self.temperature, 92 | top_p=self.top_p, 93 | frequency_penalty=frequency_penalty, 94 | presence_penalty=presence_penalty, 95 | stop=None, 96 | stream=False 97 | ) 98 | 99 | end_time = time.perf_counter() 100 | 101 | with open(usage_log_file_path, mode="a") as file: 102 | file.write(f'{self.model_name},{response.usage.prompt_tokens},{response.usage.completion_tokens}\n') 103 | 104 | run_details = { 105 | "api_calls": 1, 106 | "taken_time": end_time - start_time, 107 | 108 | "prompt_tokens": response.usage.prompt_tokens, 109 | "completion_tokens": response.usage.completion_tokens, 110 | 111 | "details": [ 112 | { 113 | "model_name": self.model_name, 114 | "model_prompt": processed_input, 115 | "model_response": response.choices[0].message.content, 116 | "max_tokens": self.max_tokens, 117 | "temperature": self.temperature, 118 | "top_p": self.top_p, 119 | "frequency_penalty": frequency_penalty, 120 | "presence_penalty": presence_penalty 121 | } 122 | ], 123 | } 124 | 125 | return response.choices[0].message.content, run_details 126 | 127 | # This class is intended for only azure openai api for some special cases 128 | # Do not use this class for openai api 129 | class OpenAIV2Model(OpenAIModel): 130 | def __init__(self, model_name, sleep_time=60, **kwargs): 131 | if model_name is None: 132 | raise Exception("Model name is required") 133 | 134 | self.model_name = model_name 135 | 136 | self.headers = { 137 | "Content-Type": "application/json", 138 | "api-key": kwargs.get("api-key", api_key), 139 | } 140 | self.end_point = kwargs.get("end_point", api_base) 141 | 142 | self.temperature = kwargs.get("temperature", 0.0) 143 | self.top_p = kwargs.get("top_p", 0.95) 144 | self.max_tokens = kwargs.get("max_tokens", 4096) 145 | 146 | self.sleep_time = sleep_time 147 | 148 | 149 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5)) 150 | def prompt( 151 | self, 152 | processed_input: list[dict], 153 | frequency_penalty=0, 154 | presence_penalty=0 155 | ): 156 | 157 | time.sleep(self.sleep_time) 158 | 159 | 160 | # Payload for the request 161 | payload = { 162 | "messages": processed_input, 163 | "temperature": self.temperature, 164 | "top_p": self.top_p, 165 | "max_tokens": self.max_tokens, 166 | "frequency_penalty": frequency_penalty, 167 | "presence_penalty": presence_penalty 168 | } 169 | 170 | start_time = time.perf_counter() 171 | 172 | response = requests.post(self.end_point, headers=self.headers, json=payload) 173 | # Will raise an HTTPError if the HTTP request returned an unsuccessful status code 174 | response.raise_for_status() 175 | 176 | end_time = time.perf_counter() 177 | 178 | # Handle the response as needed (e.g., print or process) 179 | response = response.json() 180 | 181 | with open(usage_log_file_path, mode="a") as file: 182 | file.write(f'{self.model_name},{response["usage"]["prompt_tokens"]},{response["usage"]["completion_tokens"]}\n') 183 | 184 | run_details = { 185 | "api_calls": 1, 186 | "taken_time": end_time - start_time, 187 | 188 | "prompt_tokens": response["usage"]["prompt_tokens"], 189 | "completion_tokens": response["usage"]["completion_tokens"], 190 | 191 | "details": [ 192 | { 193 | "model_name": self.model_name, 194 | "model_prompt": processed_input, 195 | "model_response": response["choices"][0]["message"]["content"], 196 | "max_tokens": self.max_tokens, 197 | "temperature": self.temperature, 
198 | "top_p": self.top_p, 199 | "frequency_penalty": frequency_penalty, 200 | "presence_penalty": presence_penalty 201 | } 202 | ], 203 | } 204 | 205 | return response["choices"][0]["message"]["content"], run_details 206 | 207 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/models/__init__.py -------------------------------------------------------------------------------- /src/promptings/Analogical.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import re 5 | from copy import deepcopy 6 | 7 | from .Base import BaseStrategy 8 | from models.Base import BaseModel 9 | from datasets.Dataset import Dataset 10 | from results.Results import Results 11 | 12 | # self-generate exemplars and knowledge 13 | class AnalogicalStrategy(BaseStrategy): 14 | 15 | def run_single_pass(self, data_row: dict): 16 | input = [ 17 | { 18 | "role": "user", 19 | "content": 20 | f"""Your goal is to write {self.language} code to solve competitive programming problems. Given a problem , explain the core concepts in it and provide other relevant problems. Then solve the original problem. 21 | 22 | # Problem: 23 | {self.data.get_prompt(data_row)} 24 | 25 | # Instruction: (Your response must include the following points sequentially) 26 | 27 | ## Algorithms: 28 | Identify the core concepts or algorithms used to solve the problem. 29 | 30 | ## Tutorial: 31 | Write a useful tutorial about these algorithms. 32 | 33 | ## Example Problems: 34 | Provide three examples of relevant competitive programming problems that involve these algorithms. For each problem , describe the problem , explain the solution in detail , and then write the correct Python3 code. 
35 | 36 | ## {self.language} code to solve the original problem: 37 | Include the following points in your response: 38 | - Explanation of the solution: 39 | - {self.language} code to solve the problem (inside ``` ``` block):""", 40 | }, 41 | ] 42 | 43 | return self.gpt_chat( 44 | processed_input=input 45 | ) 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/promptings/Base.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import copy 5 | import time 6 | 7 | from models.Base import BaseModel 8 | from datasets.Dataset import Dataset 9 | from results.Results import Results 10 | from utils.parse import parse_response 11 | from time import perf_counter_ns 12 | from constants.verboseType import * 13 | 14 | class BaseStrategy(object): 15 | def __init__( 16 | self, 17 | model: BaseModel, 18 | data: Dataset, 19 | language: str, 20 | pass_at_k: int, 21 | results: Results, 22 | verbose: int = VERBOSE_FULL, 23 | ): 24 | self.model = model 25 | self.data = data 26 | self.pass_at_k = pass_at_k 27 | self.results = results 28 | self.language = language 29 | self.verbose = verbose 30 | self.run_details = [] 31 | 32 | 33 | def append_run_details(self, run_details: dict): 34 | for key in run_details.keys(): 35 | if key in self.run_details: 36 | self.run_details[key] += run_details[key] 37 | else: 38 | self.run_details[key] = run_details[key] 39 | 40 | 41 | def gpt_chat( 42 | self, 43 | processed_input: List[dict], 44 | frequency_penalty=0, 45 | presence_penalty=0 46 | ): 47 | 48 | response, run_details = self.model.prompt( 49 | processed_input=processed_input, 50 | frequency_penalty=frequency_penalty, 51 | presence_penalty=presence_penalty 52 | ) 53 | self.append_run_details(run_details) 54 | 55 | return response 56 | 57 | 58 | def run_single_pass(self, data_row: dict): 59 | pass 60 | 61 | def run(self, record_full_result): 62 | # self.data.data.reverse() 63 | 64 | num_items = len(self.data) 65 | num_success = 0 66 | 67 | for i, data_row in enumerate(self.data): 68 | if self.verbose >= VERBOSE_FULL: 69 | print("", flush=True, end="") 70 | 71 | found = False 72 | for j in range(len(self.results)): 73 | if self.results[j]["task_id"] == data_row[self.data.id_key]: 74 | item = copy.deepcopy(self.results[j]) 75 | cur_pass = len(item["source_codes"]) 76 | is_solved = item["is_solved"] 77 | cur_imp = item["source_codes"][-1] 78 | found = True 79 | break 80 | if not found: 81 | item = { 82 | self.data.id_key: data_row[self.data.id_key], 83 | "task_id": data_row[self.data.id_key], 84 | "language": self.language, 85 | "source_codes": [], 86 | "run_details": [], 87 | "no_of_try": 0, 88 | } 89 | 90 | cur_pass = 0 91 | is_solved = False 92 | cur_imp = "" 93 | 94 | while cur_pass < self.pass_at_k and not is_solved: 95 | # initialize it for each run 96 | self.run_details = {} 97 | # for _ in range(10): 98 | # try: 99 | response = self.run_single_pass(data_row) 100 | # break 101 | # except Exception as e: 102 | # time.sleep(5) 103 | # pass 104 | 105 | cur_imp = parse_response(response) 106 | 107 | item["source_codes"].append(cur_imp) 108 | 109 | # Remove Full details 110 | if not record_full_result: 111 | del self.run_details["details"] 112 | 113 | item["run_details"].append(self.run_details) 114 | 115 | item["no_of_try"] += 1 116 | 117 | is_solved = self.data.evaluate( 118 | item=data_row, 119 | cur_imp=cur_imp, 120 | language=self.language 121 | ) 122 | 123 | cur_pass += 
1 124 | 125 | if is_solved: 126 | num_success += 1 127 | 128 | item["is_solved"] = is_solved 129 | 130 | self.results.get_results().insert(i, item) 131 | 132 | # Deleting duplicate results 133 | k = i + 1 134 | while True: 135 | # Termination condition 136 | if k >= len(self.results): 137 | break 138 | 139 | # Deleting duplicate results 140 | if self.results[k]["task_id"] == data_row[self.data.id_key]: 141 | del self.results.results[k] 142 | 143 | # Increment 144 | k += 1 145 | 146 | if self.verbose >= VERBOSE_MINIMAL: 147 | print(f'completed {i+1}/{num_items}, Solved: {self.results[i]["is_solved"]}, number of success = {num_success}/{i+1}, acc = {round(num_success/(i+1)*100, 2)}') 148 | 149 | if not found: 150 | self.results.save_results() 151 | 152 | if self.verbose >= VERBOSE_FULL: 153 | print("", flush=True, end="") 154 | 155 | 156 | if len(self.results) > len(self.data): 157 | self.results.results = self.results[:len(self.data)] 158 | self.results.save_results() 159 | -------------------------------------------------------------------------------- /src/promptings/CodeSIM.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from .Base import BaseStrategy 13 | from .Direct import DirectStrategy 14 | from models.Base import BaseModel 15 | 16 | from datasets.Dataset import Dataset 17 | from datasets.APPSDataset import APPSDataset 18 | from datasets.MBPPDataset import MBPPDataset 19 | from datasets.XCodeDataset import XCodeDataset 20 | from datasets.HumanEvalDataset import HumanDataset 21 | from datasets.CodeContestDataset import CodeContestDataset 22 | 23 | from evaluations.func_evaluate import evaluate_io 24 | 25 | from utils.parse import parse_response 26 | from constants.verboseType import * 27 | 28 | class CodeSIM(DirectStrategy): 29 | def __init__( 30 | self, 31 | additional_info_run=0, 32 | max_plan_try=5, 33 | max_debug_try=5, 34 | *args, 35 | **kwargs 36 | ): 37 | super().__init__(*args, **kwargs) 38 | 39 | 40 | self.additional_info_run=additional_info_run 41 | self.max_plan_try=max_plan_try 42 | self.max_debug_try=max_debug_try 43 | 44 | self.is_competitive = type(self.data) == APPSDataset or \ 45 | type(self.data) == CodeContestDataset or \ 46 | type(self.data) == XCodeDataset 47 | 48 | 49 | if self.verbose >= VERBOSE_FULL: 50 | print("\n\n" + "_" * 70) 51 | print(f"Running CodeSIM with additional_info_run={additional_info_run}, max_plan_try={self.max_plan_try}, max_debug_try={self.max_debug_try}") 52 | print("\n", flush=True) 53 | 54 | 55 | @staticmethod 56 | def get_sample_io_str(sample_io: any) -> str: 57 | if len(sample_io) > 0: 58 | if type(sample_io[0]) == str: 59 | return "\n".join(sample_io) 60 | if type(sample_io[0]) == dict: 61 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 62 | return sample_io 63 | 64 | 65 | @staticmethod 66 | def process_test_log(test_logs: str): 67 | passed_test_cases = [] 68 | failed_test_cases = [] 69 | for test_log in test_logs.splitlines(): 70 | if test_log.startswith("Passed"): 71 | passed_test_cases.append(test_log[test_log.index("assert"):]) 72 | if test_log.startswith("Failed"): 73 | failed_test_cases.append(test_log[test_log.index("assert"):]) 74 | 75 | failed_test_cases_str = "\n".join(failed_test_cases) 76 | return f"### Test Cases where the 
generated code failed to generate the expected output:\n{failed_test_cases_str}" 77 | 78 | 79 | def parse_test_cases(self, test_cases: str): 80 | return [ 81 | test_case 82 | for test_case in test_cases.splitlines() 83 | if len(test_case) > 0 and test_case.startswith("assert") 84 | ] 85 | 86 | 87 | def check( 88 | self, 89 | data_row: dict, 90 | additional_io: List[str], 91 | code: str 92 | ) -> bool: 93 | passed_sample, test_log_sample = self.data.evaluate_sample_io( 94 | data_row, 95 | code, 96 | self.language 97 | ) 98 | 99 | passed_additional, test_log_additional = self.data.evaluate_additional_io( 100 | data_row[self.data.id_key], 101 | additional_io, 102 | code, 103 | self.language 104 | ) 105 | 106 | if self.is_competitive: 107 | test_log_sample = test_log_sample[test_log_sample.find("## Tests failed:"):] 108 | test_log = test_log_sample + test_log_additional 109 | else: 110 | test_log = self.process_test_log(test_log_sample + test_log_additional) 111 | 112 | return passed_sample & passed_additional, test_log 113 | 114 | 115 | def run_single_pass(self, data_row: dict): 116 | print("", flush=True) 117 | 118 | problem = self.data.get_prompt(data_row) 119 | 120 | std_input_prompt = "" 121 | 122 | if self.is_competitive: 123 | std_input_prompt = \ 124 | """- Strictly follow the sample input and output format. 125 | - The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take the input using `input()` function then call the function with specified parameters and finally print the output of the function. 126 | - For array input parse the array then pass it to the function. Parsing technique is given in the sample input output format section. 127 | - Do not add extra print statement otherwise it will failed the test cases.""" 128 | 129 | problem = problem[:problem.find("-------\nImportant Note:")] 130 | 131 | additional_io = [] 132 | 133 | # if type(self.data) == MBPPDataset: 134 | 135 | # # Additional IO collection 136 | # for idx in range(1, self.additional_info_run + 1): 137 | # # Additional IO 138 | # additional_io_generation_input = [ 139 | # { 140 | # "role": "user", 141 | # "content": prompt_for_additional_io.format( 142 | # problem=problem, 143 | # problem_name=data_row["entry_point"], 144 | # ), 145 | # }, 146 | # ] 147 | 148 | # if self.verbose >= VERBOSE_FULL: 149 | # print("\n\n" + "_" * 70) 150 | # print(f"Input for Additional IO Generation: {idx}\n\n") 151 | # print(additional_io_generation_input[0]['content'], flush=True) 152 | 153 | # response = self.gpt_chat( 154 | # processed_input=additional_io_generation_input, 155 | # frequency_penalty=0.2 156 | # ) 157 | 158 | # if self.verbose >= VERBOSE_FULL: 159 | # print("\n\n" + "_" * 70) 160 | # print(f"Response from Additional IO Generation: {idx}\n\n") 161 | # print(response, flush=True) 162 | 163 | # additional_io_response = response 164 | 165 | # # Applying intersection for self-consistancy 166 | # if additional_io is None: 167 | # additional_io = set(self.parse_test_cases( 168 | # test_cases=additional_io_response 169 | # )) 170 | # else: 171 | # additional_io_ = self.parse_test_cases( 172 | # test_cases=additional_io_response 173 | # ) 174 | # additional_io = additional_io.intersection(set(additional_io_)) 175 | 176 | # additional_io = list(additional_io) 177 | # if self.verbose >= VERBOSE_FULL: 178 | # print(f"Additional IOs:") 179 | # print(additional_io, flush=True) 180 | 181 | # # Forcing no sample io as MBPP 
contains no sample io 182 | # data_row['sample_io'] = [] 183 | 184 | # else: 185 | # additional_io = [] 186 | 187 | self.run_details["additional_io"] = additional_io 188 | 189 | 190 | # Planning, Coding, Debugging 191 | for plan_no in range(1, self.max_plan_try + 1): 192 | # Planning Phase 193 | 194 | # if self.is_competative: 195 | input_for_planning = [ 196 | { 197 | "role": "user", 198 | "content": prompt_for_planning.format( 199 | problem=problem, 200 | language=self.language, 201 | ) 202 | }, 203 | ] 204 | # else: 205 | # input_for_planning = [ 206 | # { 207 | # "role": "user", 208 | # "content": prompt_for_planning.format( 209 | # problem=problem, 210 | # language=self.language, 211 | # ) 212 | # }, 213 | # ] 214 | 215 | if self.verbose >= VERBOSE_FULL: 216 | print("\n\n" + "_" * 70) 217 | print(f"Input for Planning: {plan_no}\n\n") 218 | print(input_for_planning[0]['content'], flush=True) 219 | 220 | response = self.gpt_chat( 221 | processed_input=input_for_planning 222 | ) 223 | 224 | if self.verbose >= VERBOSE_FULL: 225 | print("\n\n" + "_" * 70) 226 | print(f"Response from Planning: {plan_no}\n\n") 227 | print(response, flush=True) 228 | 229 | # if "```" in response: 230 | # plan = parse_response(response) 231 | # else: 232 | # plan = response[response.find("### Plan"):] 233 | 234 | if "### Plan" not in response: 235 | plan = f"### Plan\n\n{response}" 236 | else: 237 | plan = response[response.rfind("### Plan"):] 238 | 239 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 240 | 241 | # Simulation Phase 242 | input_for_simulation = [ 243 | { 244 | "role": "user", 245 | "content": prompt_for_simulation.format( 246 | problem_with_planning=problem_with_planning, 247 | language=self.language, 248 | ) 249 | }, 250 | ] 251 | 252 | if self.verbose >= VERBOSE_FULL: 253 | print("\n\n" + "_" * 70) 254 | print(f"Input for Simulation: {plan_no}\n\n") 255 | print(input_for_simulation[0]['content'], flush=True) 256 | 257 | response = self.gpt_chat( 258 | processed_input=input_for_simulation 259 | ) 260 | 261 | if self.verbose >= VERBOSE_FULL: 262 | print("\n\n" + "_" * 70) 263 | print(f"Response from Simulation: {plan_no}\n\n") 264 | print(response, flush=True) 265 | 266 | if "Plan Modification Needed" in response and \ 267 | "No Plan Modification Needed" not in response: 268 | if self.verbose >= VERBOSE_FULL: 269 | print("\n\n" + "_" * 70) 270 | print(f"**Plan Modification Needed.**\n") 271 | 272 | # Plan Refinement Phase 273 | input_for_plan_refinement = [ 274 | { 275 | "role": "user", 276 | "content": prompt_for_plan_refinement.format( 277 | problem_with_planning=problem_with_planning, 278 | language=self.language, 279 | critique=response 280 | ) 281 | }, 282 | ] 283 | 284 | if self.verbose >= VERBOSE_FULL: 285 | print("\n\n" + "_" * 70) 286 | print(f"Input for Plan Refinement: {plan_no}\n\n") 287 | print(input_for_plan_refinement[0]['content'], flush=True) 288 | 289 | plan = self.gpt_chat( 290 | processed_input=input_for_simulation 291 | ) 292 | 293 | if self.verbose >= VERBOSE_FULL: 294 | print("\n\n" + "_" * 70) 295 | print(f"Response from Plan Refinement: {plan_no}\n\n") 296 | print(plan, flush=True) 297 | 298 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 299 | 300 | # Code generation 301 | input_for_final_code_generation = [ 302 | { 303 | "role": "user", 304 | "content": prompt_for_code_generation.format( 305 | problem_with_planning=problem_with_planning, 306 | language=self.language, 307 | std_input_prompt=std_input_prompt, 308 | ) 309 | } 310 | ] 311 | 312 
| if self.verbose >= VERBOSE_FULL: 313 | print("\n\n" + "_" * 70) 314 | print(f"Input for final code generation:\n\n") 315 | print(input_for_final_code_generation[0]['content'], flush=True) 316 | 317 | response = self.gpt_chat( 318 | input_for_final_code_generation 319 | ) 320 | 321 | if self.verbose >= VERBOSE_FULL: 322 | print("\n\n" + "_" * 70) 323 | print(f"Response from final code generation:\n\n") 324 | print(response, flush=True) 325 | 326 | code = parse_response(response) 327 | 328 | passed, test_log = self.check(data_row, additional_io, code) 329 | 330 | # Do not need to go for debugging steps 331 | if passed: 332 | break 333 | 334 | # problem_with_solution = f"{problem_with_planning}\n\n### Code:\n\n```{self.language}\n{code}\n```" 335 | 336 | # Debugging 337 | for debug_no in range(1, self.max_debug_try + 1): 338 | 339 | input_for_debugging = [ 340 | { 341 | "role": "user", 342 | "content": prompt_for_debugging.format( 343 | problem_with_planning=problem_with_planning, 344 | code=code, 345 | language=self.language, 346 | test_log=test_log, 347 | std_input_prompt=std_input_prompt, 348 | ) 349 | } 350 | ] 351 | 352 | if self.verbose >= VERBOSE_FULL: 353 | print("\n\n" + "_" * 70) 354 | print(f"Input for Improving code: {plan_no}, {debug_no}\n\n") 355 | print(input_for_debugging[0]['content'], flush=True) 356 | 357 | response = self.gpt_chat(input_for_debugging) 358 | 359 | if self.verbose >= VERBOSE_FULL: 360 | print("\n\n" + "_" * 70) 361 | print(f"Response from Improving code: {plan_no}, {debug_no}\n\n") 362 | print(response, flush=True) 363 | 364 | code = parse_response(response) 365 | 366 | passed, test_log = self.check(data_row, additional_io, code) 367 | 368 | # Passed so breaking this debugging loop 369 | if passed: 370 | break 371 | 372 | if passed: 373 | break 374 | 375 | if self.verbose >= VERBOSE_FULL: 376 | print("\n\n" + "_" * 70) 377 | 378 | return code 379 | 380 | 381 | prompt_for_planning = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 382 | 383 | ## Problem 384 | 385 | {problem} 386 | 387 | **Expected Output:** 388 | 389 | Your response must be structured as follows: 390 | 391 | ### Problem Understanding 392 | 393 | - Think about the original problem. Develop an initial understanding about the problem. 394 | 395 | ### Recall Example Problem 396 | 397 | Recall a relevant and distinct problems (different from problem mentioned above) and 398 | - Describe it 399 | - Generate {language} code step by step to solve that problem 400 | - Discuss the algorithm to solve this problem 401 | - Finally generate a planning to solve that problem 402 | 403 | ### Algorithm to solve the original problem 404 | 405 | - Write down the algorithm that is well suited for the original problem 406 | - Give some tutorials to about the algorithm for example: 407 | - How to approach this type of algorithm 408 | - Important things to consider 409 | 410 | ### Plan 411 | 412 | - Write down a detailed, step-by-step plan to solve the **original problem**. 413 | 414 | -------- 415 | **Important Instruction:** 416 | - Strictly follow the instructions. 417 | - Do not generate code. 418 | """ 419 | 420 | 421 | prompt_for_simulation = """You are a programmer tasked with verifying a plan to solve a given problem using the **{language}** programming language. 
422 | 423 | {problem_with_planning} 424 | 425 | **Expected Output:** 426 | 427 | Your response must be structured as follows: 428 | 429 | ### Simulation 430 | 431 | - Take a sample input and apply plan step by step to get the output. Do not generate code do it manually by applying reasoning. 432 | - Compare the generated output with the sample output to verify if your plan works as expected. 433 | 434 | ### Plan Evaluation 435 | 436 | - If the simulation is successful write **No Need to Modify Plan**. 437 | - Otherwise write **Plan Modification Needed**. 438 | 439 | """ 440 | 441 | 442 | prompt_for_plan_refinement = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. You already have a wrong plan. Correct it so that it can generate correct plan. 443 | 444 | {problem_with_planning} 445 | 446 | ## Plan Critique 447 | 448 | {critique} 449 | 450 | **Expected Output:** 451 | 452 | Your response must be structured as follows: 453 | 454 | ## New Plan 455 | 456 | - Write down a detailed, step-by-step modified plan to solve the **original problem**. 457 | - Ensure each step logically follows from the previous one. 458 | 459 | -------- 460 | **Important Instruction:** 461 | - Your response must contain only the plan. 462 | - Do not add any explanation. 463 | - Do not generate code. 464 | """ 465 | 466 | 467 | 468 | prompt_for_code_generation = """You are a programmer tasked with solving a given problem using the **{language}** programming language. See the plan to solve the plan and implement code to solve it. 469 | 470 | {problem_with_planning} 471 | 472 | -------- 473 | **Important Instructions:** 474 | - Do not add any explanation. 475 | - The generated **{language}** code must be inside a triple backtick (```) code block. 476 | {std_input_prompt}""" 477 | 478 | 479 | prompt_for_debugging = """You are a programmer who has received a solution of a problem written in **{language}** that fails to pass certain test cases. Your task is to modify the code in such a way so that it can pass all the test cases. Do not generate same code. 480 | 481 | {problem_with_planning} 482 | 483 | ### Buggy Code 484 | ```{language} 485 | {code} 486 | ``` 487 | 488 | {test_log} 489 | 490 | **Expected Output:** 491 | 492 | Your response must be structured as follows: 493 | 494 | ### Simulation with failed test case 495 | To detect where is the bug follow following steps: 496 | - Take a sample test case where it fails. 497 | - Take the input go through each step according to the plan 498 | - You will get a output that must be different from the expected output. 499 | 500 | ### Debugging Notes 501 | - Based on this simulation detect any of the following cases: 502 | - Plan is wrong 503 | - Plan is correct but plan to code generation is wrong. 504 | - Finally, discuss how to correct this code. 505 | 506 | ### Modified Code 507 | 508 | ```{language} 509 | # Your corrected code, with comments explaining each correction. 510 | ``` 511 | 512 | -------- 513 | **Important Instructions:** 514 | - Strictly follow the instructions. 515 | - Do not add testing code for example assert statement in your code. 516 | - Do not be overconfident that the generated code is correct. It is wrong. 517 | - The modified **{language}** code must be enclosed within triple backticks (```). 518 | - Your response must contain **Simulation with failed test case**, **Debugging Notes**, 519 | and **Modified Code** section. 
520 | {std_input_prompt}""" 521 | 522 | -------------------------------------------------------------------------------- /src/promptings/Direct.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | from copy import deepcopy 5 | 6 | from .Base import BaseStrategy 7 | from models.Base import BaseModel 8 | from datasets.Dataset import Dataset 9 | from results.Results import Results 10 | 11 | 12 | class DirectStrategy(BaseStrategy): 13 | def run_single_pass(self, data_row: dict): 14 | processed_input = [ 15 | { 16 | "role": "user", 17 | "content": f'{self.data.get_prompt(data_row)}\n\nGenerate {self.language} code to solve the above mentioned problem:', 18 | }, 19 | ] 20 | return self.gpt_chat(processed_input=processed_input) 21 | -------------------------------------------------------------------------------- /src/promptings/MapCoder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from .Base import BaseStrategy 13 | from models.Base import BaseModel 14 | 15 | from datasets.Dataset import Dataset 16 | from datasets.APPSDataset import APPSDataset 17 | from datasets.MBPPDataset import MBPPDataset 18 | from datasets.XCodeDataset import XCodeDataset 19 | from datasets.HumanEvalDataset import HumanDataset 20 | from datasets.CodeContestDataset import CodeContestDataset 21 | 22 | from results.Results import Results 23 | from evaluations.func_evaluate import evaluate_io 24 | from constants.verboseType import * 25 | 26 | mapping = { 27 | 1: "one (01)", 28 | 2: "two (02)", 29 | 3: "three (03)", 30 | 4: "four (04)", 31 | 5: "five (05)", 32 | 6: "six (06)", 33 | 7: "seven (07)", 34 | 8: "eight (08)", 35 | 9: "nine (09)", 36 | } 37 | 38 | # KB + Exemplars + Example Planning + Problem Planning + Code Generation + Sample IO testing + Code Improvement 39 | 40 | 41 | class MapCoder(BaseStrategy): 42 | def __init__( 43 | self, 44 | k: int = 3, 45 | t: int = 5, 46 | *args, 47 | **kwargs 48 | ): 49 | super().__init__(*args, **kwargs) 50 | self.k = k 51 | self.t = t 52 | 53 | def xml_to_dict(self, element): 54 | result = {} 55 | for child in element: 56 | if child: 57 | child_data = self.xml_to_dict(child) 58 | if child.tag in result: 59 | if isinstance(result[child.tag], list): 60 | result[child.tag].append(child_data) 61 | else: 62 | result[child.tag] = [result[child.tag], child_data] 63 | else: 64 | result[child.tag] = child_data 65 | else: 66 | result[child.tag] = child.text 67 | return result 68 | 69 | def parse_xml(self, response: str) -> dict: 70 | if '```xml' in response: 71 | response = response.replace('```xml', '') 72 | if '```' in response: 73 | response = response.replace('```', '') 74 | 75 | try: 76 | root = ET.fromstring(response) 77 | except: 78 | try: 79 | root = ET.fromstring('<root>\n' + response + '\n</root>') 80 | except: 81 | root = ET.fromstring('<root>\n' + response) 82 | return self.xml_to_dict(root) 83 | 84 | def parse_code(self, response: str) -> str: 85 | if "```" not in response: 86 | return response 87 | 88 | code_pattern = r'```((.|\n)*?)```' 89 | if "```Python" in response: 90 | code_pattern = r'```Python((.|\n)*?)```' 91 | if "```Python3" in response: 92 | code_pattern = r'```Python3((.|\n)*?)```' 93 | if "```python" in response: 94 | code_pattern = 
r'```python((.|\n)*?)```' 95 | if "```python3" in response: 96 | code_pattern = r'```python3((.|\n)*?)```' 97 | if "```C" in response: 98 | code_pattern = r'```C((.|\n)*?)```' 99 | if "```c" in response: 100 | code_pattern = r'```c((.|\n)*?)```' 101 | if "```C++" in response: 102 | code_pattern = r'```C\+\+((.|\n)*?)```' 103 | if "```c++" in response: 104 | code_pattern = r'```c\+\+((.|\n)*?)```' 105 | if "```Java" in response: 106 | code_pattern = r'```Java((.|\n)*?)```' 107 | if "```java" in response: 108 | code_pattern = r'```java((.|\n)*?)```' 109 | if "```Node" in response: 110 | code_pattern = r'```Node((.|\n)*?)```' 111 | if "```node" in response: 112 | code_pattern = r'```node((.|\n)*?)```' 113 | if "```Rust" in response: 114 | code_pattern = r'```Rust((.|\n)*?)```' 115 | if "```rust" in response: 116 | code_pattern = r'```rust((.|\n)*?)```' 117 | if "```PHP" in response: 118 | code_pattern = r'```PHP((.|\n)*?)```' 119 | if "```php" in response: 120 | code_pattern = r'```php((.|\n)*?)```' 121 | if "```Go" in response: 122 | code_pattern = r'```Go((.|\n)*?)```' 123 | if "```go" in response: 124 | code_pattern = r'```go((.|\n)*?)```' 125 | if "```Ruby" in response: 126 | code_pattern = r'```Ruby((.|\n)*?)```' 127 | if "```ruby" in response: 128 | code_pattern = r'```ruby((.|\n)*?)```' 129 | if "```C#" in response: 130 | code_pattern = r'```C#((.|\n)*?)```' 131 | if "```c#" in response: 132 | code_pattern = r'```c#((.|\n)*?)```' 133 | if "```csharp" in response: 134 | code_pattern = r'```csharp((.|\n)*?)```' 135 | 136 | code_blocks = re.findall(code_pattern, response, re.DOTALL) 137 | 138 | if type(code_blocks[-1]) == tuple or type(code_blocks[-1]) == list: 139 | code_str = "\n".join(code_blocks[-1]) 140 | elif type(code_blocks[-1]) == str: 141 | code_str = code_blocks[-1] 142 | else: 143 | code_str = response 144 | 145 | return code_str 146 | 147 | @staticmethod 148 | def trim_text(text: str, trimmed_text: str): 149 | return text.replace(trimmed_text, '').strip() 150 | 151 | @staticmethod 152 | def replace_tag(text: str, tag: str): 153 | if f'<{tag}><![CDATA[' in text and f']]></{tag}>' in text: 154 | return text 155 | else: 156 | return text.replace(f'<{tag}>', f'<{tag}><![CDATA[').replace(f'</{tag}>', f']]></{tag}>').strip() 157 | 158 | @staticmethod 159 | def get_sample_io_str(sample_io: any) -> str: 160 | if len(sample_io) > 0: 161 | if type(sample_io[0]) == str: 162 | return "\n".join(sample_io) 163 | if type(sample_io[0]) == dict: 164 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 165 | return sample_io 166 | 167 | def run_single_pass(self, data_row: dict): 168 | if self.verbose >= VERBOSE_FULL: 169 | print("", flush=True) 170 | 171 | input_kb_exemplars = [ 172 | { 173 | "role": "user", 174 | "content": f"""Given a problem, provide relevant problems then identify the algorithm behind it and also explain the tutorial of the algorithm. 175 | # Problem: 176 | {self.data.get_prompt(data_row)} 177 | 178 | # Exemplars: 179 | Recall {mapping[self.k]} relevant and distinct problems (different from problem mentioned above). For each problem, 180 | 1. describe it 181 | 2. generate {self.language} code step by step to solve that problem 182 | 3. finally generate a planning to solve that problem 183 | 184 | # Algorithm: 185 | 186 | ---------------- 187 | Important: 188 | Your response must follow the following xml format- 189 | 190 | <root> 191 | <problem> 192 | # Recall {mapping[self.k]} relevant and distinct problems (different from problem mentioned above). Write each problem in the following format.
193 | <description> 194 | # Describe the problem. 195 | </description> 196 | <code> 197 | # Let's think step by step to solve this problem in {self.language} programming language. 198 | </code> 199 | <planning> 200 | # Planning to solve this problem. 201 | </planning> 202 | </problem> 203 | 204 | # similarly add more problems here... 205 | 206 | <algorithm> 207 | # Identify the algorithm (Brute-force, Dynamic Programming, Divide-and-conquer, Greedy, Backtracking, Recursive, Binary search, and so on) that needs to be used to solve the original problem. 208 | # Write a useful tutorial about the above mentioned algorithms. Provide a high level generic tutorial for solving this types of problem. Do not generate code. 209 | </algorithm> 210 | </root> 211 | """, 212 | }, 213 | ] 214 | 215 | if self.verbose >= VERBOSE_FULL: 216 | print("\n\n________________________") 217 | print("Input for knowledge base and exemplars: ") 218 | print(input_kb_exemplars[0]['content'], flush=True) 219 | 220 | response = self.gpt_chat( 221 | processed_input=input_kb_exemplars 222 | ) 223 | 224 | # Post processing 225 | response = self.trim_text( 226 | response, "# Identify the algorithm (Brute-force, Dynamic Programming, Divide-and-conquer, Greedy, Backtracking, Recursive, Binary search, and so on) that needs to be used to solve the original problem.") 227 | response = self.trim_text( 228 | response, "# Write a useful tutorial about the above mentioned algorithms. Provide a high level generic tutorial for solving this types of problem. Do not generate code.") 229 | response = self.trim_text( 230 | response, "# Planning to solve this problem:") 231 | response = self.trim_text( 232 | response, f"# Let's think step by step to solve this problem in {self.language} programming language.") 233 | response = self.replace_tag(response, 'algorithm') 234 | response = self.replace_tag(response, 'description') 235 | response = self.replace_tag(response, 'code') 236 | response = self.replace_tag(response, 'planning') 237 | 238 | if self.verbose >= VERBOSE_FULL: 239 | print("\n\n________________________") 240 | print("Response from knowledge base and exemplars: ") 241 | print(response, flush=True) 242 | 243 | response = self.parse_xml(response) 244 | 245 | algorithm_prompt = f"## Relevant Algorithm to solve the next problem:\n{ response['algorithm']}" 246 | sample_io_prompt = f"## Sample Test cases: \n{self.get_sample_io_str(data_row['sample_io'])}\n" 247 | # if type(self.data) != MBPPDataset and type(self.data) != XCodeDataset else "" 248 | 249 | plannings = [] 250 | for example_no, example in enumerate(response["problem"], start=1): 251 | example_problem = example["description"] 252 | example_planning = example["planning"] 253 | 254 | input_for_problem_planning = [ 255 | { 256 | "role": "user", 257 | "content": f"Given a competitive programming problem generate a concrete planning to solve the problem.\n# Problem:\n{example_problem}\n# Planning:\n{example_planning}\n{algorithm_prompt}\n## Problem to be solved:\n{self.data.get_prompt(data_row)}\n{sample_io_prompt}\n## Planning:\n\n----------------\nImportant: You should give only the planning to solve the problem. Do not add extra explanation or words."
258 | } 259 | ] 260 | 261 | if self.verbose >= VERBOSE_FULL: 262 | print("\n\n________________________") 263 | print( 264 | f"Input for our problem planning using example: {example_no}: ") 265 | print(input_for_problem_planning[0]['content'], flush=True) 266 | 267 | planning = self.gpt_chat( 268 | input_for_problem_planning 269 | ) 270 | 271 | # planning = self.parse_xml(planning) 272 | # planning['confidence'] = int(str(planning['confidence']).strip()) 273 | 274 | if self.verbose >= VERBOSE_FULL: 275 | print("\n\n________________________") 276 | print("Response from our problem planning: ") 277 | print(planning, flush=True) 278 | 279 | input_for_planning_verification = [ 280 | { 281 | "role": "user", 282 | "content": f"Given a competitive programming problem and a plan to solve the problem in {self.language}, tell whether the plan is correct to solve this problem.\n\n# Problem:\n{self.data.get_prompt(data_row)}\n# Planning:\n{planning}\n\n----------------\nImportant: Your response must follow the following xml format-```\n<root>\n<explanation> Discuss whether the given competitive programming problem is solvable by using the above mentioned planning.</explanation>\n<confidence> Confidence score regarding the solvability of the problem. Must be an integer between 0 and 100. </confidence>\n</root>\n```" 283 | } 284 | ] 285 | 286 | if self.verbose >= VERBOSE_FULL: 287 | print("Input for planning verification: ") 288 | print(input_for_planning_verification[0]['content'], flush=True) 289 | 290 | verification_res = self.gpt_chat( 291 | input_for_planning_verification 292 | ) 293 | 294 | verification_res = self.replace_tag( 295 | verification_res, 'explanation') 296 | verification_res = self.replace_tag(verification_res, 'confidence') 297 | 298 | verification_res = self.parse_xml(verification_res) 299 | 300 | verification_res['confidence'] = int( 301 | str(verification_res['confidence']).strip()) 302 | 303 | if self.verbose >= VERBOSE_FULL: 304 | print("Response from planning verification: ") 305 | print(verification_res, flush=True) 306 | 307 | plannings.append(( 308 | planning, 309 | verification_res['confidence'], 310 | example 311 | )) 312 | 313 | # if type(self.data) == MBPPDataset and verification_res['confidence'] == 100: 314 | # break 315 | 316 | plannings.sort(key=lambda x: x[1], reverse=True) 317 | # time.sleep(1) 318 | 319 | if type(self.data) == APPSDataset or type(self.data) == CodeContestDataset or type(self.data) == XCodeDataset: 320 | std_input_prompt = "## Note: Strictly follow the input and output format. The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take input using `input()` function then call the function with specified parameters and finally print the output of the function. Do not add extra print statement otherwise it will failed the test cases." 321 | else: 322 | std_input_prompt = "" 323 | 324 | for planning_with_ex in plannings: 325 | planning, confidence, example = planning_with_ex 326 | 327 | input_for_final_code_generation = [ 328 | { 329 | "role": "user", 330 | "content": f"Given a competitive programming problem generate {self.language} code to solve the problem.\n{algorithm_prompt}\n## Problem to be solved:\n{self.data.get_prompt(data_row)}\n## Planning:\n{planning}\n{sample_io_prompt}\n## Let's think step by step.\n\n----------------\nImportant:\n{std_input_prompt}\n## Your response must contain only the {self.language} code to solve this problem. Do not add extra explanation or words."
331 | } 332 | ] 333 | 334 | if self.verbose >= VERBOSE_FULL: 335 | print("\n\n________________________") 336 | print("Input for final code generation: ") 337 | print(input_for_final_code_generation[0]['content'], flush=True) 338 | 339 | code = self.gpt_chat( 340 | input_for_final_code_generation 341 | ) 342 | 343 | code = self.parse_code(code) 344 | 345 | if self.verbose >= VERBOSE_FULL: 346 | print("\n\n________________________") 347 | print("Response from final code generation: ") 348 | print(code, flush=True) 349 | 350 | response = f"## Planning: {planning}\n## Code:\n```\n{code}\n```" 351 | passed = False 352 | 353 | for i in range(1, self.t + 1): 354 | passed, test_log = self.data.evaluate_sample_io( 355 | data_row, 356 | code, 357 | self.language 358 | ) 359 | 360 | if passed: 361 | break 362 | 363 | if self.verbose >= VERBOSE_FULL: 364 | print(f"Input for improving code generation: {i}") 365 | input_for_improving_code = [ 366 | { 367 | "role": "user", 368 | "content": f"Given a competitive programming problem you have generated {self.language} code to solve the problem. But the generated code can not pass sample test cases. Improve your code to solve the problem correctly.\n{algorithm_prompt}\n## Problem to be solved:\n{self.data.get_prompt(data_row)}\n{response}\n## Test Report:\n{test_log}\n## Modified Planning:\n## Let's think step by step to modify {self.language} Code for solving this problem.\n\n----------------\nImportant:\n{std_input_prompt}\n## Your response must contain the modified planning and then the {self.language} code inside ``` block to solve this problem." 369 | } 370 | ] 371 | 372 | if self.verbose >= VERBOSE_FULL: 373 | print("\n\n________________________") 374 | print("Input for improving code generation: ") 375 | print(input_for_improving_code[0]['content'], flush=True) 376 | 377 | response = self.gpt_chat( 378 | input_for_improving_code 379 | ) 380 | code = self.parse_code(response) 381 | 382 | if self.verbose >= VERBOSE_FULL: 383 | print("\n\n________________________") 384 | print("Response from improving code generation: ") 385 | print(response, flush=True) 386 | 387 | # got a code that passed all sample test cases 388 | if passed: 389 | break 390 | 391 | if self.verbose >= VERBOSE_FULL: 392 | print("________________________\n\n", flush=True) 393 | 394 | return code 395 | -------------------------------------------------------------------------------- /src/promptings/PromptingFactory.py: -------------------------------------------------------------------------------- 1 | from promptings.CoT import CoTStrategy 2 | from promptings.Direct import DirectStrategy 3 | from promptings.Analogical import AnalogicalStrategy 4 | from promptings.SelfPlanning import SelfPlanningStrategy 5 | from promptings.MapCoder import MapCoder 6 | 7 | from promptings.CodeSIM import CodeSIM 8 | from promptings.variations.CodeSIMA import CodeSIMA 9 | from promptings.variations.CodeSIMC import CodeSIMC 10 | from promptings.variations.CodeSIMWD import CodeSIMWD 11 | from promptings.variations.CodeSIMWPV import CodeSIMWPV 12 | from promptings.variations.CodeSIMWPVD import CodeSIMWPVD 13 | 14 | class PromptingFactory: 15 | @staticmethod 16 | def get_prompting_class(prompting_name): 17 | if prompting_name == "CoT": 18 | return CoTStrategy 19 | elif prompting_name == "MapCoder": 20 | return MapCoder 21 | elif prompting_name == "Direct": 22 | return DirectStrategy 23 | elif prompting_name == "Analogical": 24 | return AnalogicalStrategy 25 | elif prompting_name == "SelfPlanning": 26 | return 
SelfPlanningStrategy 27 | elif prompting_name == "CodeSIM": 28 | return CodeSIM 29 | elif prompting_name == "CodeSIMA": 30 | return CodeSIMA 31 | elif prompting_name == "CodeSIMC": 32 | return CodeSIMC 33 | elif prompting_name == "CodeSIMWD": 34 | return CodeSIMWD 35 | elif prompting_name == "CodeSIMWPV": 36 | return CodeSIMWPV 37 | elif prompting_name == "CodeSIMWPVD": 38 | return CodeSIMWPVD 39 | else: 40 | raise Exception(f"Unknown prompting name {prompting_name}") 41 | -------------------------------------------------------------------------------- /src/promptings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/promptings/__init__.py -------------------------------------------------------------------------------- /src/promptings/variations/CodeSIMA.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from ..Base import BaseStrategy 13 | from ..Direct import DirectStrategy 14 | from models.Base import BaseModel 15 | 16 | from datasets.Dataset import Dataset 17 | from datasets.APPSDataset import APPSDataset 18 | from datasets.MBPPDataset import MBPPDataset 19 | from datasets.XCodeDataset import XCodeDataset 20 | from datasets.HumanEvalDataset import HumanDataset 21 | from datasets.CodeContestDataset import CodeContestDataset 22 | 23 | from evaluations.func_evaluate import evaluate_io 24 | 25 | from utils.parse import parse_response 26 | from constants.verboseType import * 27 | 28 | class CodeSIMA(DirectStrategy): 29 | def __init__( 30 | self, 31 | additional_info_run=2, 32 | max_plan_try=5, 33 | max_debug_try=5, 34 | *args, 35 | **kwargs 36 | ): 37 | super().__init__(*args, **kwargs) 38 | 39 | 40 | self.additional_info_run=additional_info_run 41 | self.max_plan_try=max_plan_try 42 | self.max_debug_try=max_debug_try 43 | 44 | self.is_competative = type(self.data) == APPSDataset or \ 45 | type(self.data) == CodeContestDataset or \ 46 | type(self.data) == XCodeDataset 47 | 48 | # Cost reduction for competative programming 49 | if self.is_competative: 50 | self.max_plan_try = 3 51 | self.max_debug_try = 3 52 | 53 | 54 | if self.verbose >= VERBOSE_FULL: 55 | print("\n\n" + "_" * 70) 56 | print(f"Running CodeSIM with additional_info_run={additional_info_run}, max_plan_try={self.max_plan_try}, max_debug_try={self.max_debug_try}") 57 | print("\n", flush=True) 58 | 59 | 60 | @staticmethod 61 | def get_sample_io_str(sample_io: any) -> str: 62 | if len(sample_io) > 0: 63 | if type(sample_io[0]) == str: 64 | return "\n".join(sample_io) 65 | if type(sample_io[0]) == dict: 66 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 67 | return sample_io 68 | 69 | 70 | @staticmethod 71 | def process_test_log(test_logs: str): 72 | passed_test_cases = [] 73 | failed_test_cases = [] 74 | for test_log in test_logs.splitlines(): 75 | if test_log.startswith("Passed"): 76 | passed_test_cases.append(test_log[test_log.index("assert"):]) 77 | if test_log.startswith("Failed"): 78 | failed_test_cases.append(test_log[test_log.index("assert"):]) 79 | 80 | failed_test_cases_str = "\n".join(failed_test_cases) 81 | return f"### Test Cases where the generated code failed to generate the expected 
output:\n{failed_test_cases_str}" 82 | 83 | 84 | 85 | def parse_test_cases(self, test_cases: str): 86 | return [ 87 | test_case 88 | for test_case in test_cases.splitlines() 89 | if len(test_case) > 0 and test_case.startswith("assert") 90 | ] 91 | 92 | 93 | def check( 94 | self, 95 | data_row: dict, 96 | additional_io: List[str], 97 | code: str 98 | ) -> bool: 99 | passed_sample, test_log_sample = self.data.evaluate_sample_io( 100 | data_row, 101 | code, 102 | self.language 103 | ) 104 | 105 | passed_additional, test_log_additional = self.data.evaluate_additional_io( 106 | data_row[self.data.id_key], 107 | additional_io, 108 | code, 109 | self.language 110 | ) 111 | 112 | if self.is_competative: 113 | test_log_sample = test_log_sample[test_log_sample.find("## Tests failed:"):] 114 | test_log = test_log_sample + test_log_additional 115 | else: 116 | test_log = self.process_test_log(test_log_sample + test_log_additional) 117 | 118 | return passed_sample & passed_additional, test_log 119 | 120 | 121 | def run_single_pass(self, data_row: dict): 122 | print("", flush=True) 123 | 124 | problem = self.data.get_prompt(data_row) 125 | 126 | std_input_prompt = "" 127 | 128 | if self.is_competative: 129 | std_input_prompt = "- Strictly follow the input and output format. The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take input using `input()` function then call the function with specified parameters and finally print the output of the function. Do not add extra print statement otherwise it will failed the test cases." 130 | 131 | problem = problem[:problem.find("-------\nImportant Note:")] 132 | 133 | additional_io = None 134 | 135 | # Additional IO collection 136 | for idx in range(1, self.additional_info_run + 1): 137 | # Additional IO 138 | additional_io_generation_input = [ 139 | { 140 | "role": "user", 141 | "content": prompt_for_additional_io.format( 142 | problem=problem, 143 | problem_name=data_row["entry_point"], 144 | ), 145 | }, 146 | ] 147 | 148 | if self.verbose >= VERBOSE_FULL: 149 | print("\n\n" + "_" * 70) 150 | print(f"Input for Additional IO Generation: {idx}\n\n") 151 | print(additional_io_generation_input[0]['content'], flush=True) 152 | 153 | response = self.gpt_chat( 154 | processed_input=additional_io_generation_input, 155 | frequency_penalty=0.2 156 | ) 157 | 158 | if self.verbose >= VERBOSE_FULL: 159 | print("\n\n" + "_" * 70) 160 | print(f"Response from Additional IO Generation: {idx}\n\n") 161 | print(response, flush=True) 162 | 163 | additional_io_response = response 164 | 165 | # Applying intersection for self-consistancy 166 | if additional_io is None: 167 | additional_io = set(self.parse_test_cases( 168 | test_cases=additional_io_response 169 | )) 170 | else: 171 | additional_io_ = self.parse_test_cases( 172 | test_cases=additional_io_response 173 | ) 174 | additional_io = additional_io.intersection(set(additional_io_)) 175 | 176 | additional_io = list(additional_io) 177 | if self.verbose >= VERBOSE_FULL: 178 | print(f"Additional IOs:") 179 | print(additional_io, flush=True) 180 | 181 | self.run_details["additional_io"] = additional_io 182 | 183 | 184 | # Planning, Coding, Debugging 185 | for plan_no in range(1, self.max_plan_try + 1): 186 | # Planning Phase 187 | 188 | # if self.is_competative: 189 | input_for_planning = [ 190 | { 191 | "role": "user", 192 | "content": prompt_for_planning_competative.format( 193 | problem=problem, 194 | 
language=self.language, 195 | ) 196 | }, 197 | ] 198 | # else: 199 | # input_for_planning = [ 200 | # { 201 | # "role": "user", 202 | # "content": prompt_for_planning.format( 203 | # problem=problem, 204 | # language=self.language, 205 | # ) 206 | # }, 207 | # ] 208 | 209 | if self.verbose >= VERBOSE_FULL: 210 | print("\n\n" + "_" * 70) 211 | print(f"Input for Planning: {plan_no}\n\n") 212 | print(input_for_planning[0]['content'], flush=True) 213 | 214 | response = self.gpt_chat( 215 | processed_input=input_for_planning 216 | ) 217 | 218 | if self.verbose >= VERBOSE_FULL: 219 | print("\n\n" + "_" * 70) 220 | print(f"Response from Planning: {plan_no}\n\n") 221 | print(response, flush=True) 222 | 223 | # if "```" in response: 224 | # plan = parse_response(response) 225 | # else: 226 | # plan = response[response.find("### Plan"):] 227 | 228 | if "### Plan" not in response: 229 | plan = f"### Plan\n\n{response}" 230 | else: 231 | plan = response[response.rfind("### Plan"):] 232 | 233 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 234 | 235 | # Simulation Phase 236 | input_for_simulation = [ 237 | { 238 | "role": "user", 239 | "content": prompt_for_simulation.format( 240 | problem_with_planning=problem_with_planning, 241 | language=self.language, 242 | ) 243 | }, 244 | ] 245 | 246 | if self.verbose >= VERBOSE_FULL: 247 | print("\n\n" + "_" * 70) 248 | print(f"Input for Simulation: {plan_no}\n\n") 249 | print(input_for_simulation[0]['content'], flush=True) 250 | 251 | response = self.gpt_chat( 252 | processed_input=input_for_simulation 253 | ) 254 | 255 | if self.verbose >= VERBOSE_FULL: 256 | print("\n\n" + "_" * 70) 257 | print(f"Response from Simulation: {plan_no}\n\n") 258 | print(response, flush=True) 259 | 260 | if "Plan Modification Needed" in response and \ 261 | "No Plan Modification Needed" not in response: 262 | if self.verbose >= VERBOSE_FULL: 263 | print("\n\n" + "_" * 70) 264 | print(f"**Plan Modification Needed.**\n") 265 | 266 | # Plan Refinement Phase 267 | input_for_plan_refinement = [ 268 | { 269 | "role": "user", 270 | "content": prompt_for_plan_refinement.format( 271 | problem_with_planning=problem_with_planning, 272 | language=self.language, 273 | critique=response 274 | ) 275 | }, 276 | ] 277 | 278 | if self.verbose >= VERBOSE_FULL: 279 | print("\n\n" + "_" * 70) 280 | print(f"Input for Plan Refinement: {plan_no}\n\n") 281 | print(input_for_plan_refinement[0]['content'], flush=True) 282 | 283 | plan = self.gpt_chat( 284 | processed_input=input_for_simulation 285 | ) 286 | 287 | if self.verbose >= VERBOSE_FULL: 288 | print("\n\n" + "_" * 70) 289 | print(f"Response from Plan Refinement: {plan_no}\n\n") 290 | print(plan, flush=True) 291 | 292 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 293 | 294 | # Code generation 295 | input_for_final_code_generation = [ 296 | { 297 | "role": "user", 298 | "content": prompt_for_code_generation.format( 299 | problem_with_planning=problem_with_planning, 300 | language=self.language, 301 | std_input_prompt=std_input_prompt, 302 | ) 303 | } 304 | ] 305 | 306 | if self.verbose >= VERBOSE_FULL: 307 | print("\n\n" + "_" * 70) 308 | print(f"Input for final code generation:\n\n") 309 | print(input_for_final_code_generation[0]['content'], flush=True) 310 | 311 | response = self.gpt_chat( 312 | input_for_final_code_generation 313 | ) 314 | 315 | if self.verbose >= VERBOSE_FULL: 316 | print("\n\n" + "_" * 70) 317 | print(f"Response from final code generation:\n\n") 318 | print(response, flush=True) 319 | 320 | code = 
parse_response(response) 321 | 322 | passed, test_log = self.check(data_row, additional_io, code) 323 | 324 | # Do not need to go for debugging steps 325 | if passed: 326 | break 327 | 328 | # problem_with_solution = f"{problem_with_planning}\n\n### Code:\n\n```{self.language}\n{code}\n```" 329 | 330 | # Debugging 331 | for debug_no in range(1, self.max_debug_try + 1): 332 | 333 | input_for_debugging = [ 334 | { 335 | "role": "user", 336 | "content": prompt_for_debugging.format( 337 | problem_with_planning=problem_with_planning, 338 | code=code, 339 | language=self.language, 340 | test_log=test_log, 341 | std_input_prompt=std_input_prompt, 342 | ) 343 | } 344 | ] 345 | 346 | if self.verbose >= VERBOSE_FULL: 347 | print("\n\n" + "_" * 70) 348 | print(f"Input for Improving code: {plan_no}, {debug_no}\n\n") 349 | print(input_for_debugging[0]['content'], flush=True) 350 | 351 | response = self.gpt_chat(input_for_debugging) 352 | 353 | if self.verbose >= VERBOSE_FULL: 354 | print("\n\n" + "_" * 70) 355 | print(f"Response from Improving code: {plan_no}, {debug_no}\n\n") 356 | print(response, flush=True) 357 | 358 | code = parse_response(response) 359 | 360 | passed, test_log = self.check(data_row, additional_io, code) 361 | 362 | # Passed so breaking this debugging loop 363 | if passed: 364 | break 365 | 366 | if passed: 367 | break 368 | 369 | if self.verbose >= VERBOSE_FULL: 370 | print("\n\n" + "_" * 70) 371 | 372 | return code 373 | 374 | 375 | prompt_for_additional_io = """You are a tester tasked with creating comprehensive unit test cases for a given programming problem. 376 | 377 | ## Problem 378 | 379 | def maximum_segments(n, a, b, c): 380 | ''' 381 | Write a Python function to find the maximum number of segments of lengths a, b, and c 382 | that can be formed from n. 383 | ''' 384 | 385 | ### Problem Understanding 386 | 387 | The task is to maximize the number of segments you can cut from a total length `n`, where the possible segment lengths are `a`, `b`, and `c`. Let say we have a rope of length `n` meter. We need to cut it into segments. Possible segment length is `a`, `b`, and `c`. There may be many possible way of doing these segments. We need to find out the maximum number of segments from that rope. 388 | 389 | ### Test Cases 390 | assert maximum_segments(7, 5, 2, 5) == 2 391 | assert maximum_segments(17, 2, 1, 3) == 17 392 | assert maximum_segments(18, 16, 3, 6) == 6 393 | assert maximum_segments(11, 8, 4, 9) == -1 394 | assert maximum_segments(5, 9, 6, 10) == -1 395 | 396 | --- 397 | 398 | ## Problem 399 | 400 | {problem} 401 | 402 | -------- 403 | **Important Instruction:** 404 | For the problem `{problem_name}` 405 | - First, understand the problem `{problem_name}` and write down the understanding inside **Problem Understanding** section. 406 | - Then Generate five (05) unit test cases that cover both: 407 | - **Normal** and **Edge** case scenarios 408 | - **Positive** and **Negative** case scenarios 409 | - **Valid** and **Invalid** case scenarios 410 | inside **Test Cases** section. 411 | - Write down each test case in a single line following the pattern shown in the example problem. 412 | - Do not generate any code to solve this problem. 413 | """ 414 | 415 | 416 | prompt_for_initial_code_generation = """{problem} 417 | 418 | -------- 419 | Important Instructions: 420 | - Generate {language} code step-by-step to solve the above mentioned problem. 421 | - Do not generate any explanation. 
422 | - The generated **{language}** code must be enclosed within triple backticks (```). 423 | {std_input_prompt}""" 424 | 425 | 426 | prompt_for_code_validation = """You are a tester tasked with checking a code for a given problem. 427 | 428 | --- 429 | 430 | ## Problem 431 | 432 | {problem} 433 | 434 | ## Code 435 | 436 | {code} 437 | 438 | --- 439 | 440 | **Your output must follow the steps below:** 441 | - Try to generate a test case other than the sample test cases that are mentioned inside the problem. 442 | - Take a the input and apply the code step by step to get the output. 443 | - Compare the generated output with the expected output to verify if the generated code is ok or not. 444 | - Write **Buggy Code** if you find such a test case otherwise write **Code is ok**. 445 | """ 446 | 447 | 448 | prompt_for_planning = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 449 | 450 | ## Problem 451 | 452 | {problem} 453 | 454 | **Expected Output:** 455 | 456 | Your response must be structured as follows: 457 | 458 | ### Problem Understanding 459 | 460 | Think about the original problem. Develop an initial understanding about the problem. 461 | 462 | ### Recall Example Problem 463 | 464 | Recall a relevant and distinct problems (different from problem mentioned above) and 465 | - describe it 466 | - generate {language} code step by step to solve that problem 467 | - finally generate a planning to solve that problem 468 | 469 | ### Plan 470 | 471 | - Write down a detailed, step-by-step plan to solve the **original problem**. 472 | 473 | -------- 474 | **Important Instruction:** 475 | - Strictly follow the instructions. 476 | - Do not generate code. 477 | """ 478 | 479 | 480 | prompt_for_planning_competative = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 481 | 482 | ## Problem 483 | 484 | {problem} 485 | 486 | **Expected Output:** 487 | 488 | Your response must be structured as follows: 489 | 490 | ### Problem Understanding 491 | 492 | - Think about the original problem. Develop an initial understanding about the problem. 493 | 494 | ### Recall Example Problem 495 | 496 | Recall a relevant and distinct problems (different from problem mentioned above) and 497 | - Describe it 498 | - Generate {language} code step by step to solve that problem 499 | - Discuss the algorithm to solve this problem 500 | - Finally generate a planning to solve that problem 501 | 502 | ### Algorithm to solve the original problem 503 | 504 | - Write down the algorithm that is well suited for the original problem 505 | - Give some tutorials to about the algorithm for example: 506 | - How to approach this type of algorithm 507 | - Important things to consider 508 | 509 | ### Plan 510 | 511 | - Write down a detailed, step-by-step plan to solve the **original problem**. 512 | 513 | -------- 514 | **Important Instruction:** 515 | - Strictly follow the instructions. 516 | - Do not generate code. 517 | """ 518 | 519 | 520 | prompt_for_simulation = """You are a programmer tasked with verifying a plan to solve a given problem using the **{language}** programming language. 521 | 522 | {problem_with_planning} 523 | 524 | **Expected Output:** 525 | 526 | Your response must be structured as follows: 527 | 528 | ### Simulation 529 | 530 | - Take a sample input and apply plan step by step to get the output. 
531 | - Compare the generated output with the sample output to verify if your plan works as expected. 532 | 533 | ### Plan Evaluation 534 | 535 | - If the simulation is successful write **No Need to Modify Plan**. 536 | - Otherwise write **Plan Modification Needed**. 537 | 538 | """ 539 | 540 | 541 | prompt_for_plan_refinement = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. You already have a wrong plan. Correct it so that it can generate correct code. 542 | 543 | {problem_with_planning} 544 | 545 | ## Plan Critique 546 | 547 | {critique} 548 | 549 | **Expected Output:** 550 | 551 | Your response must be structured as follows: 552 | 553 | ## New Plan 554 | 555 | - Write down a detailed, step-by-step modified plan to solve the **original problem**. 556 | - Ensure each step logically follows from the previous one. 557 | 558 | -------- 559 | **Important Instruction:** 560 | - Your response must contain only the plan. 561 | - Do not add any explanation. 562 | - Do not generate code. 563 | """ 564 | 565 | 566 | 567 | prompt_for_code_generation = """You are a programmer tasked with solving a given problem using the **{language}** programming language. See the plan to solve the plan and implement code to solve it. 568 | 569 | {problem_with_planning} 570 | 571 | -------- 572 | **Important Instructions:** 573 | - Do not add any explanation. 574 | - The generated **{language}** code must be inside a triple backtick (```) code block. 575 | {std_input_prompt}""" 576 | 577 | 578 | prompt_for_debugging = """You are a programmer who has received a solution of a problem written in **{language}** that fails to pass certain test cases. Your task is to modify the code in such a way so that it can pass all the test cases. Do not generate same code. 579 | 580 | {problem_with_planning} 581 | 582 | ### Buggy Code 583 | ```{language} 584 | {code} 585 | ``` 586 | 587 | ### Test Report 588 | 589 | {test_log} 590 | 591 | **Expected Output:** 592 | 593 | Your response must be structured as follows: 594 | 595 | ### Simulation with failed test case 596 | To detect where is the bug: 597 | - Take a sample test case where it fails. 598 | - Take the input go through each step according to the plan 599 | - You will get a output that must be different from the expected output. 600 | 601 | ### Debugging Notes 602 | Based on this simulation detect any of the following cases: 603 | - Plan is wrong 604 | - Plan is correct but plan to code generation is wrong. 605 | 606 | - Finally, discuss how to correct this code. 607 | 608 | ### Modified Code 609 | 610 | ```{language} 611 | # Your corrected code, with comments explaining each correction. 612 | ``` 613 | 614 | -------- 615 | **Important Instructions:** 616 | - Strictly follow the instructions. 617 | - Do not add testing code for example assert statement in your code. 618 | - Do not be overconfident that the generated code is correct. It is wrong. 619 | - The modified **{language}** code must be enclosed within triple backticks (```). 620 | - Your response must contain **Simulation with failed test case**, **Debugging Notes**, and **Modified Code** section. 
621 | {std_input_prompt}""" 622 | 623 | -------------------------------------------------------------------------------- /src/promptings/variations/CodeSIMWD.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from ..Base import BaseStrategy 13 | from ..Direct import DirectStrategy 14 | from models.Base import BaseModel 15 | 16 | 17 | from datasets.Dataset import Dataset 18 | from datasets.APPSDataset import APPSDataset 19 | from datasets.MBPPDataset import MBPPDataset 20 | from datasets.XCodeDataset import XCodeDataset 21 | from datasets.HumanEvalDataset import HumanDataset 22 | from datasets.CodeContestDataset import CodeContestDataset 23 | 24 | from evaluations.func_evaluate import evaluate_io 25 | 26 | from utils.parse import parse_response 27 | from constants.verboseType import * 28 | 29 | class CodeSIMWD(DirectStrategy): 30 | def __init__( 31 | self, 32 | additional_info_run=2, 33 | max_plan_try=5, 34 | max_debug_try=5, 35 | *args, 36 | **kwargs 37 | ): 38 | super().__init__(*args, **kwargs) 39 | 40 | 41 | self.additional_info_run=additional_info_run 42 | self.max_plan_try=max_plan_try 43 | self.max_debug_try=max_debug_try 44 | 45 | self.is_competative = type(self.data) == APPSDataset or \ 46 | type(self.data) == CodeContestDataset or \ 47 | type(self.data) == XCodeDataset 48 | 49 | # Cost reduction for competative programming 50 | if self.is_competative: 51 | self.max_plan_try = 3 52 | self.max_debug_try = 3 53 | 54 | 55 | if self.verbose >= VERBOSE_FULL: 56 | print("\n\n" + "_" * 70) 57 | print(f"Running CodeSIM with additional_info_run={additional_info_run}, max_plan_try={self.max_plan_try}, max_debug_try={self.max_debug_try}") 58 | print("\n", flush=True) 59 | 60 | 61 | @staticmethod 62 | def get_sample_io_str(sample_io: any) -> str: 63 | if len(sample_io) > 0: 64 | if type(sample_io[0]) == str: 65 | return "\n".join(sample_io) 66 | if type(sample_io[0]) == dict: 67 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 68 | return sample_io 69 | 70 | 71 | @staticmethod 72 | def process_test_log(test_logs: str): 73 | passed_test_cases = [] 74 | failed_test_cases = [] 75 | for test_log in test_logs.splitlines(): 76 | if test_log.startswith("Passed"): 77 | passed_test_cases.append(test_log[test_log.index("assert"):]) 78 | if test_log.startswith("Failed"): 79 | failed_test_cases.append(test_log[test_log.index("assert"):]) 80 | 81 | failed_test_cases_str = "\n".join(failed_test_cases) 82 | return f"### Test Cases where the generated code failed to generate the expected output:\n{failed_test_cases_str}" 83 | 84 | 85 | def parse_test_cases(self, test_cases: str): 86 | return [ 87 | test_case 88 | for test_case in test_cases.splitlines() 89 | if len(test_case) > 0 and test_case.startswith("assert") 90 | ] 91 | 92 | 93 | def check( 94 | self, 95 | data_row: dict, 96 | additional_io: List[str], 97 | code: str 98 | ) -> bool: 99 | passed_sample, test_log_sample = self.data.evaluate_sample_io( 100 | data_row, 101 | code, 102 | self.language 103 | ) 104 | 105 | passed_additional, test_log_additional = self.data.evaluate_additional_io( 106 | data_row[self.data.id_key], 107 | additional_io, 108 | code, 109 | self.language 110 | ) 111 | 112 | if self.is_competative: 113 | test_log_sample = 
test_log_sample[test_log_sample.find("## Tests failed:"):] 114 | test_log = test_log_sample + test_log_additional 115 | else: 116 | test_log = self.process_test_log(test_log_sample + test_log_additional) 117 | 118 | return passed_sample & passed_additional, test_log 119 | 120 | 121 | def run_single_pass(self, data_row: dict): 122 | print("", flush=True) 123 | 124 | problem = self.data.get_prompt(data_row) 125 | 126 | std_input_prompt = "" 127 | 128 | if self.is_competative: 129 | std_input_prompt = "- Strictly follow the input and output format. The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take input using `input()` function then call the function with specified parameters and finally print the output of the function. Do not add extra print statement otherwise it will failed the test cases." 130 | 131 | problem = problem[:problem.find("-------\nImportant Note:")] 132 | 133 | additional_io = [] 134 | self.run_details["additional_io"] = additional_io 135 | 136 | 137 | # Planning, Coding, Debugging 138 | for plan_no in range(1, self.max_plan_try + 1): 139 | # Planning Phase 140 | 141 | input_for_planning = [ 142 | { 143 | "role": "user", 144 | "content": prompt_for_planning_competative.format( 145 | problem=problem, 146 | language=self.language, 147 | ) 148 | }, 149 | ] 150 | 151 | if self.verbose >= VERBOSE_FULL: 152 | print("\n\n" + "_" * 70) 153 | print(f"Input for Planning: {plan_no}\n\n") 154 | print(input_for_planning[0]['content'], flush=True) 155 | 156 | response = self.gpt_chat( 157 | processed_input=input_for_planning 158 | ) 159 | 160 | if self.verbose >= VERBOSE_FULL: 161 | print("\n\n" + "_" * 70) 162 | print(f"Response from Planning: {plan_no}\n\n") 163 | print(response, flush=True) 164 | 165 | # if "```" in response: 166 | # plan = parse_response(response) 167 | # else: 168 | # plan = response[response.find("### Plan"):] 169 | 170 | if "### Plan" not in response: 171 | plan = f"### Plan\n\n{response}" 172 | else: 173 | plan = response[response.rfind("### Plan"):] 174 | 175 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 176 | 177 | # Simulation Phase 178 | input_for_simulation = [ 179 | { 180 | "role": "user", 181 | "content": prompt_for_simulation.format( 182 | problem_with_planning=problem_with_planning, 183 | language=self.language, 184 | ) 185 | }, 186 | ] 187 | 188 | if self.verbose >= VERBOSE_FULL: 189 | print("\n\n" + "_" * 70) 190 | print(f"Input for Simulation: {plan_no}\n\n") 191 | print(input_for_simulation[0]['content'], flush=True) 192 | 193 | response = self.gpt_chat( 194 | processed_input=input_for_simulation 195 | ) 196 | 197 | if self.verbose >= VERBOSE_FULL: 198 | print("\n\n" + "_" * 70) 199 | print(f"Response from Simulation: {plan_no}\n\n") 200 | print(response, flush=True) 201 | 202 | if "Plan Modification Needed" in response and \ 203 | "No Plan Modification Needed" not in response: 204 | if self.verbose >= VERBOSE_FULL: 205 | print("\n\n" + "_" * 70) 206 | print(f"**Plan Modification Needed.**\n") 207 | 208 | # Plan Refinement Phase 209 | input_for_plan_refinement = [ 210 | { 211 | "role": "user", 212 | "content": prompt_for_plan_refinement.format( 213 | problem_with_planning=problem_with_planning, 214 | language=self.language, 215 | critique=response 216 | ) 217 | }, 218 | ] 219 | 220 | if self.verbose >= VERBOSE_FULL: 221 | print("\n\n" + "_" * 70) 222 | print(f"Input for Plan Refinement: {plan_no}\n\n") 223 | 
print(input_for_plan_refinement[0]['content'], flush=True) 224 | 225 | plan = self.gpt_chat( 226 | processed_input=input_for_simulation 227 | ) 228 | 229 | if self.verbose >= VERBOSE_FULL: 230 | print("\n\n" + "_" * 70) 231 | print(f"Response from Plan Refinement: {plan_no}\n\n") 232 | print(plan, flush=True) 233 | 234 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 235 | 236 | # Code generation 237 | input_for_final_code_generation = [ 238 | { 239 | "role": "user", 240 | "content": prompt_for_code_generation.format( 241 | problem_with_planning=problem_with_planning, 242 | language=self.language, 243 | std_input_prompt=std_input_prompt, 244 | ) 245 | } 246 | ] 247 | 248 | if self.verbose >= VERBOSE_FULL: 249 | print("\n\n" + "_" * 70) 250 | print(f"Input for final code generation:\n\n") 251 | print(input_for_final_code_generation[0]['content'], flush=True) 252 | 253 | response = self.gpt_chat( 254 | input_for_final_code_generation 255 | ) 256 | 257 | if self.verbose >= VERBOSE_FULL: 258 | print("\n\n" + "_" * 70) 259 | print(f"Response from final code generation:\n\n") 260 | print(response, flush=True) 261 | 262 | code = parse_response(response) 263 | 264 | passed, test_log = self.check(data_row, additional_io, code) 265 | 266 | # Do not need to go for debugging steps 267 | if passed: 268 | break 269 | 270 | if self.verbose >= VERBOSE_FULL: 271 | print("\n\n" + "_" * 70) 272 | 273 | return code 274 | 275 | 276 | prompt_for_additional_io = """You are a tester tasked with creating comprehensive unit test cases for a given programming problem. 277 | 278 | ## Problem 279 | 280 | def maximum_segments(n, a, b, c): 281 | ''' 282 | Write a Python function to find the maximum number of segments of lengths a, b, and c 283 | that can be formed from n. 284 | ''' 285 | 286 | ### Problem Understanding 287 | 288 | The task is to maximize the number of segments you can cut from a total length `n`, where the possible segment lengths are `a`, `b`, and `c`. Let say we have a rope of length `n` meter. We need to cut it into segments. Possible segment length is `a`, `b`, and `c`. There may be many possible way of doing these segments. We need to find out the maximum number of segments from that rope. 289 | 290 | ### Test Cases 291 | assert maximum_segments(7, 5, 2, 5) == 2 292 | assert maximum_segments(17, 2, 1, 3) == 17 293 | assert maximum_segments(18, 16, 3, 6) == 6 294 | assert maximum_segments(11, 8, 4, 9) == -1 295 | assert maximum_segments(5, 9, 6, 10) == -1 296 | 297 | --- 298 | 299 | ## Problem 300 | 301 | {problem} 302 | 303 | -------- 304 | **Important Instruction:** 305 | For the problem `{problem_name}` 306 | - First, understand the problem `{problem_name}` and write down the understanding inside **Problem Understanding** section. 307 | - Then Generate five (05) unit test cases that cover both: 308 | - **Normal** and **Edge** case scenarios 309 | - **Positive** and **Negative** case scenarios 310 | - **Valid** and **Invalid** case scenarios 311 | inside **Test Cases** section. 312 | - Write down each test case in a single line following the pattern shown in the example problem. 313 | - Do not generate any code to solve this problem. 314 | """ 315 | 316 | 317 | prompt_for_initial_code_generation = """{problem} 318 | 319 | -------- 320 | Important Instructions: 321 | - Generate {language} code step-by-step to solve the above mentioned problem. 322 | - Do not generate any explanation. 323 | - The generated **{language}** code must be enclosed within triple backticks (```). 
324 | {std_input_prompt}""" 325 | 326 | 327 | prompt_for_code_validation = """You are a tester tasked with checking a code for a given problem. 328 | 329 | --- 330 | 331 | ## Problem 332 | 333 | {problem} 334 | 335 | ## Code 336 | 337 | {code} 338 | 339 | --- 340 | 341 | **Your output must follow the steps below:** 342 | - Try to generate a test case other than the sample test cases that are mentioned inside the problem. 343 | - Take a the input and apply the code step by step to get the output. 344 | - Compare the generated output with the expected output to verify if the generated code is ok or not. 345 | - Write **Buggy Code** if you find such a test case otherwise write **Code is ok**. 346 | """ 347 | 348 | 349 | prompt_for_planning = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 350 | 351 | ## Problem 352 | 353 | {problem} 354 | 355 | **Expected Output:** 356 | 357 | Your response must be structured as follows: 358 | 359 | ### Problem Understanding 360 | 361 | Think about the original problem. Develop an initial understanding about the problem. 362 | 363 | ### Recall Example Problem 364 | 365 | Recall a relevant and distinct problems (different from problem mentioned above) and 366 | - describe it 367 | - generate {language} code step by step to solve that problem 368 | - finally generate a planning to solve that problem 369 | 370 | ### Plan 371 | 372 | - Write down a detailed, step-by-step plan to solve the **original problem**. 373 | 374 | -------- 375 | **Important Instruction:** 376 | - Strictly follow the instructions. 377 | - Do not generate code. 378 | """ 379 | 380 | 381 | prompt_for_planning_competative = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 382 | 383 | ## Problem 384 | 385 | {problem} 386 | 387 | **Expected Output:** 388 | 389 | Your response must be structured as follows: 390 | 391 | ### Problem Understanding 392 | 393 | - Think about the original problem. Develop an initial understanding about the problem. 394 | 395 | ### Recall Example Problem 396 | 397 | Recall a relevant and distinct problems (different from problem mentioned above) and 398 | - Describe it 399 | - Generate {language} code step by step to solve that problem 400 | - Discuss the algorithm to solve this problem 401 | - Finally generate a planning to solve that problem 402 | 403 | ### Algorithm to solve the original problem 404 | 405 | - Write down the algorithm that is well suited for the original problem 406 | - Give some tutorials to about the algorithm for example: 407 | - How to approach this type of algorithm 408 | - Important things to consider 409 | 410 | ### Plan 411 | 412 | - Write down a detailed, step-by-step plan to solve the **original problem**. 413 | 414 | -------- 415 | **Important Instruction:** 416 | - Strictly follow the instructions. 417 | - Do not generate code. 418 | """ 419 | 420 | 421 | prompt_for_simulation = """You are a programmer tasked with verifying a plan to solve a given problem using the **{language}** programming language. 422 | 423 | {problem_with_planning} 424 | 425 | **Expected Output:** 426 | 427 | Your response must be structured as follows: 428 | 429 | ### Simulation 430 | 431 | - Take a sample input and apply plan step by step to get the output. 432 | - Compare the generated output with the sample output to verify if your plan works as expected. 
433 | 434 | ### Plan Evaluation 435 | 436 | - If the simulation is successful write **No Need to Modify Plan**. 437 | - Otherwise write **Plan Modification Needed**. 438 | 439 | """ 440 | 441 | 442 | prompt_for_plan_refinement = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. You already have a wrong plan. Correct it so that it can generate correct code. 443 | 444 | {problem_with_planning} 445 | 446 | ## Plan Critique 447 | 448 | {critique} 449 | 450 | **Expected Output:** 451 | 452 | Your response must be structured as follows: 453 | 454 | ## New Plan 455 | 456 | - Write down a detailed, step-by-step modified plan to solve the **original problem**. 457 | - Ensure each step logically follows from the previous one. 458 | 459 | -------- 460 | **Important Instruction:** 461 | - Your response must contain only the plan. 462 | - Do not add any explanation. 463 | - Do not generate code. 464 | """ 465 | 466 | 467 | 468 | prompt_for_code_generation = """You are a programmer tasked with solving a given problem using the **{language}** programming language. See the plan to solve the plan and implement code to solve it. 469 | 470 | {problem_with_planning} 471 | 472 | -------- 473 | **Important Instructions:** 474 | - Do not add any explanation. 475 | - The generated **{language}** code must be inside a triple backtick (```) code block. 476 | {std_input_prompt}""" 477 | 478 | 479 | prompt_for_debugging = """You are a programmer who has received a solution of a problem written in **{language}** that fails to pass certain test cases. Your task is to modify the code in such a way so that it can pass all the test cases. Do not generate same code. 480 | 481 | {problem_with_planning} 482 | 483 | ### Buggy Code 484 | ```{language} 485 | {code} 486 | ``` 487 | 488 | ### Test Report 489 | 490 | {test_log} 491 | 492 | **Expected Output:** 493 | 494 | Your response must be structured as follows: 495 | 496 | ### Simulation with failed test case 497 | To detect where is the bug: 498 | - Take a sample test case where it fails. 499 | - Take the input go through each step according to the plan 500 | - You will get a output that must be different from the expected output. 501 | 502 | ### Debugging Notes 503 | Based on this simulation detect any of the following cases: 504 | - Plan is wrong 505 | - Plan is correct but plan to code generation is wrong. 506 | 507 | - Finally, discuss how to correct this code. 508 | 509 | ### Modified Code 510 | 511 | ```{language} 512 | # Your corrected code, with comments explaining each correction. 513 | ``` 514 | 515 | -------- 516 | **Important Instructions:** 517 | - Strictly follow the instructions. 518 | - Do not add testing code for example assert statement in your code. 519 | - Do not be overconfident that the generated code is correct. It is wrong. 520 | - The modified **{language}** code must be enclosed within triple backticks (```). 521 | - Your response must contain **Simulation with failed test case**, **Debugging Notes**, and **Modified Code** section. 
522 | {std_input_prompt}""" 523 | 524 | -------------------------------------------------------------------------------- /src/promptings/variations/CodeSIMWPV.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from ..Base import BaseStrategy 13 | from ..Direct import DirectStrategy 14 | from models.Base import BaseModel 15 | 16 | from datasets.Dataset import Dataset 17 | from datasets.APPSDataset import APPSDataset 18 | from datasets.MBPPDataset import MBPPDataset 19 | from datasets.XCodeDataset import XCodeDataset 20 | from datasets.HumanEvalDataset import HumanDataset 21 | from datasets.CodeContestDataset import CodeContestDataset 22 | 23 | from evaluations.func_evaluate import evaluate_io 24 | 25 | from utils.parse import parse_response 26 | from constants.verboseType import * 27 | 28 | class CodeSIMWPV(DirectStrategy): 29 | def __init__( 30 | self, 31 | additional_info_run=2, 32 | max_plan_try=5, 33 | max_debug_try=5, 34 | *args, 35 | **kwargs 36 | ): 37 | super().__init__(*args, **kwargs) 38 | 39 | 40 | self.additional_info_run=additional_info_run 41 | self.max_plan_try=max_plan_try 42 | self.max_debug_try=max_debug_try 43 | 44 | self.is_competative = type(self.data) == APPSDataset or \ 45 | type(self.data) == CodeContestDataset or \ 46 | type(self.data) == XCodeDataset 47 | 48 | # Cost reduction for competative programming 49 | if self.is_competative: 50 | self.max_plan_try = 3 51 | self.max_debug_try = 3 52 | 53 | 54 | if self.verbose >= VERBOSE_FULL: 55 | print("\n\n" + "_" * 70) 56 | print(f"Running CodeSIM with additional_info_run={additional_info_run}, max_plan_try={self.max_plan_try}, max_debug_try={self.max_debug_try}") 57 | print("\n", flush=True) 58 | 59 | 60 | @staticmethod 61 | def get_sample_io_str(sample_io: any) -> str: 62 | if len(sample_io) > 0: 63 | if type(sample_io[0]) == str: 64 | return "\n".join(sample_io) 65 | if type(sample_io[0]) == dict: 66 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 67 | return sample_io 68 | 69 | 70 | @staticmethod 71 | def process_test_log(test_logs: str): 72 | passed_test_cases = [] 73 | failed_test_cases = [] 74 | for test_log in test_logs.splitlines(): 75 | if test_log.startswith("Passed"): 76 | passed_test_cases.append(test_log[test_log.index("assert"):]) 77 | if test_log.startswith("Failed"): 78 | failed_test_cases.append(test_log[test_log.index("assert"):]) 79 | 80 | failed_test_cases_str = "\n".join(failed_test_cases) 81 | return f"### Test Cases where the generated code failed to generate the expected output:\n{failed_test_cases_str}" 82 | 83 | 84 | 85 | def parse_test_cases(self, test_cases: str): 86 | return [ 87 | test_case 88 | for test_case in test_cases.splitlines() 89 | if len(test_case) > 0 and test_case.startswith("assert") 90 | ] 91 | 92 | 93 | def check( 94 | self, 95 | data_row: dict, 96 | additional_io: List[str], 97 | code: str 98 | ) -> bool: 99 | passed_sample, test_log_sample = self.data.evaluate_sample_io( 100 | data_row, 101 | code, 102 | self.language 103 | ) 104 | 105 | passed_additional, test_log_additional = self.data.evaluate_additional_io( 106 | data_row[self.data.id_key], 107 | additional_io, 108 | code, 109 | self.language 110 | ) 111 | 112 | if self.is_competative: 113 | test_log_sample = 
test_log_sample[test_log_sample.find("## Tests failed:"):] 114 | test_log = test_log_sample + test_log_additional 115 | else: 116 | test_log = self.process_test_log(test_log_sample + test_log_additional) 117 | 118 | return passed_sample & passed_additional, test_log 119 | 120 | 121 | def run_single_pass(self, data_row: dict): 122 | print("", flush=True) 123 | 124 | problem = self.data.get_prompt(data_row) 125 | 126 | std_input_prompt = "" 127 | 128 | if self.is_competative: 129 | std_input_prompt = "- Strictly follow the input and output format. The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take input using `input()` function then call the function with specified parameters and finally print the output of the function. Do not add extra print statement otherwise it will failed the test cases." 130 | 131 | problem = problem[:problem.find("-------\nImportant Note:")] 132 | 133 | additional_io = [] 134 | 135 | self.run_details["additional_io"] = additional_io 136 | 137 | 138 | # Planning, Coding, Debugging 139 | for plan_no in range(1, self.max_plan_try + 1): 140 | # Planning Phase 141 | 142 | # if self.is_competative: 143 | input_for_planning = [ 144 | { 145 | "role": "user", 146 | "content": prompt_for_planning_competative.format( 147 | problem=problem, 148 | language=self.language, 149 | ) 150 | }, 151 | ] 152 | # else: 153 | # input_for_planning = [ 154 | # { 155 | # "role": "user", 156 | # "content": prompt_for_planning.format( 157 | # problem=problem, 158 | # language=self.language, 159 | # ) 160 | # }, 161 | # ] 162 | 163 | if self.verbose >= VERBOSE_FULL: 164 | print("\n\n" + "_" * 70) 165 | print(f"Input for Planning: {plan_no}\n\n") 166 | print(input_for_planning[0]['content'], flush=True) 167 | 168 | response = self.gpt_chat( 169 | processed_input=input_for_planning 170 | ) 171 | 172 | if self.verbose >= VERBOSE_FULL: 173 | print("\n\n" + "_" * 70) 174 | print(f"Response from Planning: {plan_no}\n\n") 175 | print(response, flush=True) 176 | 177 | # if "```" in response: 178 | # plan = parse_response(response) 179 | # else: 180 | # plan = response[response.find("### Plan"):] 181 | 182 | if "### Plan" not in response: 183 | plan = f"### Plan\n\n{response}" 184 | else: 185 | plan = response[response.rfind("### Plan"):] 186 | 187 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 188 | 189 | # Code generation 190 | input_for_final_code_generation = [ 191 | { 192 | "role": "user", 193 | "content": prompt_for_code_generation.format( 194 | problem_with_planning=problem_with_planning, 195 | language=self.language, 196 | std_input_prompt=std_input_prompt, 197 | ) 198 | } 199 | ] 200 | 201 | if self.verbose >= VERBOSE_FULL: 202 | print("\n\n" + "_" * 70) 203 | print(f"Input for final code generation:\n\n") 204 | print(input_for_final_code_generation[0]['content'], flush=True) 205 | 206 | response = self.gpt_chat( 207 | input_for_final_code_generation 208 | ) 209 | 210 | if self.verbose >= VERBOSE_FULL: 211 | print("\n\n" + "_" * 70) 212 | print(f"Response from final code generation:\n\n") 213 | print(response, flush=True) 214 | 215 | code = parse_response(response) 216 | 217 | passed, test_log = self.check(data_row, additional_io, code) 218 | 219 | # Do not need to go for debugging steps 220 | if passed: 221 | break 222 | 223 | # problem_with_solution = f"{problem_with_planning}\n\n### Code:\n\n```{self.language}\n{code}\n```" 224 | 225 | # Debugging 226 | for debug_no in 
range(1, self.max_debug_try + 1): 227 | 228 | input_for_debugging = [ 229 | { 230 | "role": "user", 231 | "content": prompt_for_debugging.format( 232 | problem_with_planning=problem_with_planning, 233 | code=code, 234 | language=self.language, 235 | test_log=test_log, 236 | std_input_prompt=std_input_prompt, 237 | ) 238 | } 239 | ] 240 | 241 | if self.verbose >= VERBOSE_FULL: 242 | print("\n\n" + "_" * 70) 243 | print(f"Input for Improving code: {plan_no}, {debug_no}\n\n") 244 | print(input_for_debugging[0]['content'], flush=True) 245 | 246 | response = self.gpt_chat(input_for_debugging) 247 | 248 | if self.verbose >= VERBOSE_FULL: 249 | print("\n\n" + "_" * 70) 250 | print(f"Response from Improving code: {plan_no}, {debug_no}\n\n") 251 | print(response, flush=True) 252 | 253 | code = parse_response(response) 254 | 255 | passed, test_log = self.check(data_row, additional_io, code) 256 | 257 | # Passed so breaking this debugging loop 258 | if passed: 259 | break 260 | 261 | if passed: 262 | break 263 | 264 | if self.verbose >= VERBOSE_FULL: 265 | print("\n\n" + "_" * 70) 266 | 267 | return code 268 | 269 | 270 | prompt_for_additional_io = """You are a tester tasked with creating comprehensive unit test cases for a given programming problem. 271 | 272 | ## Problem 273 | 274 | def maximum_segments(n, a, b, c): 275 | ''' 276 | Write a Python function to find the maximum number of segments of lengths a, b, and c 277 | that can be formed from n. 278 | ''' 279 | 280 | ### Problem Understanding 281 | 282 | The task is to maximize the number of segments you can cut from a total length `n`, where the possible segment lengths are `a`, `b`, and `c`. Let say we have a rope of length `n` meter. We need to cut it into segments. Possible segment length is `a`, `b`, and `c`. There may be many possible way of doing these segments. We need to find out the maximum number of segments from that rope. 283 | 284 | ### Test Cases 285 | assert maximum_segments(7, 5, 2, 5) == 2 286 | assert maximum_segments(17, 2, 1, 3) == 17 287 | assert maximum_segments(18, 16, 3, 6) == 6 288 | assert maximum_segments(11, 8, 4, 9) == -1 289 | assert maximum_segments(5, 9, 6, 10) == -1 290 | 291 | --- 292 | 293 | ## Problem 294 | 295 | {problem} 296 | 297 | -------- 298 | **Important Instruction:** 299 | For the problem `{problem_name}` 300 | - First, understand the problem `{problem_name}` and write down the understanding inside **Problem Understanding** section. 301 | - Then Generate five (05) unit test cases that cover both: 302 | - **Normal** and **Edge** case scenarios 303 | - **Positive** and **Negative** case scenarios 304 | - **Valid** and **Invalid** case scenarios 305 | inside **Test Cases** section. 306 | - Write down each test case in a single line following the pattern shown in the example problem. 307 | - Do not generate any code to solve this problem. 308 | """ 309 | 310 | 311 | prompt_for_initial_code_generation = """{problem} 312 | 313 | -------- 314 | Important Instructions: 315 | - Generate {language} code step-by-step to solve the above mentioned problem. 316 | - Do not generate any explanation. 317 | - The generated **{language}** code must be enclosed within triple backticks (```). 318 | {std_input_prompt}""" 319 | 320 | 321 | prompt_for_code_validation = """You are a tester tasked with checking a code for a given problem. 
322 | 323 | --- 324 | 325 | ## Problem 326 | 327 | {problem} 328 | 329 | ## Code 330 | 331 | {code} 332 | 333 | --- 334 | 335 | **Your output must follow the steps below:** 336 | - Try to generate a test case other than the sample test cases that are mentioned inside the problem. 337 | - Take a the input and apply the code step by step to get the output. 338 | - Compare the generated output with the expected output to verify if the generated code is ok or not. 339 | - Write **Buggy Code** if you find such a test case otherwise write **Code is ok**. 340 | """ 341 | 342 | 343 | prompt_for_planning = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 344 | 345 | ## Problem 346 | 347 | {problem} 348 | 349 | **Expected Output:** 350 | 351 | Your response must be structured as follows: 352 | 353 | ### Problem Understanding 354 | 355 | Think about the original problem. Develop an initial understanding about the problem. 356 | 357 | ### Recall Example Problem 358 | 359 | Recall a relevant and distinct problems (different from problem mentioned above) and 360 | - describe it 361 | - generate {language} code step by step to solve that problem 362 | - finally generate a planning to solve that problem 363 | 364 | ### Plan 365 | 366 | - Write down a detailed, step-by-step plan to solve the **original problem**. 367 | 368 | -------- 369 | **Important Instruction:** 370 | - Strictly follow the instructions. 371 | - Do not generate code. 372 | """ 373 | 374 | 375 | prompt_for_planning_competative = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. 376 | 377 | ## Problem 378 | 379 | {problem} 380 | 381 | **Expected Output:** 382 | 383 | Your response must be structured as follows: 384 | 385 | ### Problem Understanding 386 | 387 | - Think about the original problem. Develop an initial understanding about the problem. 388 | 389 | ### Recall Example Problem 390 | 391 | Recall a relevant and distinct problems (different from problem mentioned above) and 392 | - Describe it 393 | - Generate {language} code step by step to solve that problem 394 | - Discuss the algorithm to solve this problem 395 | - Finally generate a planning to solve that problem 396 | 397 | ### Algorithm to solve the original problem 398 | 399 | - Write down the algorithm that is well suited for the original problem 400 | - Give some tutorials to about the algorithm for example: 401 | - How to approach this type of algorithm 402 | - Important things to consider 403 | 404 | ### Plan 405 | 406 | - Write down a detailed, step-by-step plan to solve the **original problem**. 407 | 408 | -------- 409 | **Important Instruction:** 410 | - Strictly follow the instructions. 411 | - Do not generate code. 412 | """ 413 | 414 | 415 | prompt_for_simulation = """You are a programmer tasked with verifying a plan to solve a given problem using the **{language}** programming language. 416 | 417 | {problem_with_planning} 418 | 419 | **Expected Output:** 420 | 421 | Your response must be structured as follows: 422 | 423 | ### Simulation 424 | 425 | - Take a sample input and apply plan step by step to get the output. 426 | - Compare the generated output with the sample output to verify if your plan works as expected. 427 | 428 | ### Plan Evaluation 429 | 430 | - If the simulation is successful write **No Need to Modify Plan**. 431 | - Otherwise write **Plan Modification Needed**. 
432 | 433 | """ 434 | 435 | 436 | prompt_for_plan_refinement = """You are a programmer tasked with generating appropriate plan to solve a given problem using the **{language}** programming language. You already have a wrong plan. Correct it so that it can generate correct code. 437 | 438 | {problem_with_planning} 439 | 440 | ## Plan Critique 441 | 442 | {critique} 443 | 444 | **Expected Output:** 445 | 446 | Your response must be structured as follows: 447 | 448 | ## New Plan 449 | 450 | - Write down a detailed, step-by-step modified plan to solve the **original problem**. 451 | - Ensure each step logically follows from the previous one. 452 | 453 | -------- 454 | **Important Instruction:** 455 | - Your response must contain only the plan. 456 | - Do not add any explanation. 457 | - Do not generate code. 458 | """ 459 | 460 | 461 | 462 | prompt_for_code_generation = """You are a programmer tasked with solving a given problem using the **{language}** programming language. See the plan to solve the plan and implement code to solve it. 463 | 464 | {problem_with_planning} 465 | 466 | -------- 467 | **Important Instructions:** 468 | - Do not add any explanation. 469 | - The generated **{language}** code must be inside a triple backtick (```) code block. 470 | {std_input_prompt}""" 471 | 472 | 473 | prompt_for_debugging = """You are a programmer who has received a solution of a problem written in **{language}** that fails to pass certain test cases. Your task is to modify the code in such a way so that it can pass all the test cases. Do not generate same code. 474 | 475 | {problem_with_planning} 476 | 477 | ### Buggy Code 478 | ```{language} 479 | {code} 480 | ``` 481 | 482 | ### Test Report 483 | 484 | {test_log} 485 | 486 | **Expected Output:** 487 | 488 | Your response must be structured as follows: 489 | 490 | ### Simulation with failed test case 491 | To detect where is the bug: 492 | - Take a sample test case where it fails. 493 | - Take the input go through each step according to the plan 494 | - You will get a output that must be different from the expected output. 495 | 496 | ### Debugging Notes 497 | Based on this simulation detect any of the following cases: 498 | - Plan is wrong 499 | - Plan is correct but plan to code generation is wrong. 500 | 501 | - Finally, discuss how to correct this code. 502 | 503 | ### Modified Code 504 | 505 | ```{language} 506 | # Your corrected code, with comments explaining each correction. 507 | ``` 508 | 509 | -------- 510 | **Important Instructions:** 511 | - Strictly follow the instructions. 512 | - Do not add testing code for example assert statement in your code. 513 | - Do not be overconfident that the generated code is correct. It is wrong. 514 | - The modified **{language}** code must be enclosed within triple backticks (```). 515 | - Your response must contain **Simulation with failed test case**, **Debugging Notes**, and **Modified Code** section. 
516 | {std_input_prompt}""" 517 | 518 | -------------------------------------------------------------------------------- /src/promptings/variations/CodeSIMWPVD.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import tiktoken 3 | import os 4 | import json 5 | import re 6 | import sys 7 | import time 8 | 9 | from copy import deepcopy 10 | import xml.etree.ElementTree as ET 11 | 12 | from ..Base import BaseStrategy 13 | from ..Direct import DirectStrategy 14 | from models.Base import BaseModel 15 | 16 | from datasets.Dataset import Dataset 17 | from datasets.APPSDataset import APPSDataset 18 | from datasets.MBPPDataset import MBPPDataset 19 | from datasets.XCodeDataset import XCodeDataset 20 | from datasets.HumanEvalDataset import HumanDataset 21 | from datasets.CodeContestDataset import CodeContestDataset 22 | 23 | from evaluations.func_evaluate import evaluate_io 24 | 25 | from utils.parse import parse_response 26 | from constants.verboseType import * 27 | 28 | class CodeSIMWPVD(DirectStrategy): 29 | def __init__( 30 | self, 31 | additional_info_run=2, 32 | max_plan_try=5, 33 | max_debug_try=5, 34 | *args, 35 | **kwargs 36 | ): 37 | super().__init__(*args, **kwargs) 38 | 39 | 40 | self.additional_info_run=additional_info_run 41 | self.max_plan_try=max_plan_try 42 | self.max_debug_try=max_debug_try 43 | 44 | self.is_competative = type(self.data) == APPSDataset or \ 45 | type(self.data) == CodeContestDataset or \ 46 | type(self.data) == XCodeDataset 47 | 48 | # Cost reduction for competative programming 49 | if self.is_competative: 50 | self.max_plan_try = 3 51 | self.max_debug_try = 3 52 | 53 | 54 | if self.verbose >= VERBOSE_FULL: 55 | print("\n\n" + "_" * 70) 56 | print(f"Running CodeSIM with additional_info_run={additional_info_run}, max_plan_try={self.max_plan_try}, max_debug_try={self.max_debug_try}") 57 | print("\n", flush=True) 58 | 59 | 60 | @staticmethod 61 | def get_sample_io_str(sample_io: any) -> str: 62 | if len(sample_io) > 0: 63 | if type(sample_io[0]) == str: 64 | return "\n".join(sample_io) 65 | if type(sample_io[0]) == dict: 66 | return "\n".join([f"Input:\n{io['input']}\nExpected output:\n{io['output'][0]}" for io in sample_io]) 67 | return sample_io 68 | 69 | 70 | @staticmethod 71 | def process_test_log(test_logs: str): 72 | passed_test_cases = [] 73 | failed_test_cases = [] 74 | for test_log in test_logs.splitlines(): 75 | if test_log.startswith("Passed"): 76 | passed_test_cases.append(test_log[test_log.index("assert"):]) 77 | if test_log.startswith("Failed"): 78 | failed_test_cases.append(test_log[test_log.index("assert"):]) 79 | 80 | failed_test_cases_str = "\n".join(failed_test_cases) 81 | return f"### Test Cases where the generated code failed to generate the expected output:\n{failed_test_cases_str}" 82 | 83 | 84 | 85 | def parse_test_cases(self, test_cases: str): 86 | return [ 87 | test_case 88 | for test_case in test_cases.splitlines() 89 | if len(test_case) > 0 and test_case.startswith("assert") 90 | ] 91 | 92 | 93 | def check( 94 | self, 95 | data_row: dict, 96 | additional_io: List[str], 97 | code: str 98 | ) -> bool: 99 | passed_sample, test_log_sample = self.data.evaluate_sample_io( 100 | data_row, 101 | code, 102 | self.language 103 | ) 104 | 105 | passed_additional, test_log_additional = self.data.evaluate_additional_io( 106 | data_row[self.data.id_key], 107 | additional_io, 108 | code, 109 | self.language 110 | ) 111 | 112 | if self.is_competative: 113 | test_log_sample = 
test_log_sample[test_log_sample.find("## Tests failed:"):] 114 | test_log = test_log_sample + test_log_additional 115 | else: 116 | test_log = self.process_test_log(test_log_sample + test_log_additional) 117 | 118 | return passed_sample & passed_additional, test_log 119 | 120 | 121 | def run_single_pass(self, data_row: dict): 122 | print("", flush=True) 123 | 124 | problem = self.data.get_prompt(data_row) 125 | 126 | std_input_prompt = "" 127 | 128 | if self.is_competative: 129 | std_input_prompt = "- Strictly follow the input and output format. The input should be taken from Standard input and output should be given to standard output. If you are writing a function then after the function definition take input using `input()` function then call the function with specified parameters and finally print the output of the function. Do not add extra print statement otherwise it will failed the test cases." 130 | 131 | problem = problem[:problem.find("-------\nImportant Note:")] 132 | 133 | additional_io = [] 134 | 135 | self.run_details["additional_io"] = additional_io 136 | 137 | 138 | # Planning, Coding, Debugging 139 | for plan_no in range(1, self.max_plan_try + 1): 140 | # Planning Phase 141 | 142 | # if self.is_competative: 143 | input_for_planning = [ 144 | { 145 | "role": "user", 146 | "content": prompt_for_planning_competative.format( 147 | problem=problem, 148 | language=self.language, 149 | ) 150 | }, 151 | ] 152 | # else: 153 | # input_for_planning = [ 154 | # { 155 | # "role": "user", 156 | # "content": prompt_for_planning.format( 157 | # problem=problem, 158 | # language=self.language, 159 | # ) 160 | # }, 161 | # ] 162 | 163 | if self.verbose >= VERBOSE_FULL: 164 | print("\n\n" + "_" * 70) 165 | print(f"Input for Planning: {plan_no}\n\n") 166 | print(input_for_planning[0]['content'], flush=True) 167 | 168 | response = self.gpt_chat( 169 | processed_input=input_for_planning 170 | ) 171 | 172 | if self.verbose >= VERBOSE_FULL: 173 | print("\n\n" + "_" * 70) 174 | print(f"Response from Planning: {plan_no}\n\n") 175 | print(response, flush=True) 176 | 177 | # if "```" in response: 178 | # plan = parse_response(response) 179 | # else: 180 | # plan = response[response.find("### Plan"):] 181 | 182 | if "### Plan" not in response: 183 | plan = f"### Plan\n\n{response}" 184 | else: 185 | plan = response[response.rfind("### Plan"):] 186 | 187 | problem_with_planning = f"## Problem:\n{problem}\n\n{plan}" 188 | 189 | # Code generation 190 | input_for_final_code_generation = [ 191 | { 192 | "role": "user", 193 | "content": prompt_for_code_generation.format( 194 | problem_with_planning=problem_with_planning, 195 | language=self.language, 196 | std_input_prompt=std_input_prompt, 197 | ) 198 | } 199 | ] 200 | 201 | if self.verbose >= VERBOSE_FULL: 202 | print("\n\n" + "_" * 70) 203 | print(f"Input for final code generation:\n\n") 204 | print(input_for_final_code_generation[0]['content'], flush=True) 205 | 206 | response = self.gpt_chat( 207 | input_for_final_code_generation 208 | ) 209 | 210 | if self.verbose >= VERBOSE_FULL: 211 | print("\n\n" + "_" * 70) 212 | print(f"Response from final code generation:\n\n") 213 | print(response, flush=True) 214 | 215 | code = parse_response(response) 216 | 217 | passed, test_log = self.check(data_row, additional_io, code) 218 | 219 | # Do not need to go for debugging steps 220 | if passed: 221 | break 222 | 223 | 224 | if self.verbose >= VERBOSE_FULL: 225 | print("\n\n" + "_" * 70) 226 | 227 | return code 228 | 229 | 230 | prompt_for_additional_io = """You 
are a tester tasked with creating comprehensive unit test cases for a given programming problem. 231 | 232 | ## Problem 233 | 234 | def maximum_segments(n, a, b, c): 235 | ''' 236 | Write a Python function to find the maximum number of segments of lengths a, b, and c 237 | that can be formed from n. 238 | ''' 239 | 240 | ### Problem Understanding 241 | 242 | The task is to maximize the number of segments you can cut from a total length `n`, where the possible segment lengths are `a`, `b`, and `c`. Let's say we have a rope of length `n` meters that we need to cut into segments of length `a`, `b`, or `c`. There may be many possible ways of cutting it, and we need to find the one that yields the maximum number of segments. 243 | 244 | ### Test Cases 245 | assert maximum_segments(7, 5, 2, 5) == 2 246 | assert maximum_segments(17, 2, 1, 3) == 17 247 | assert maximum_segments(18, 16, 3, 6) == 6 248 | assert maximum_segments(11, 8, 4, 9) == -1 249 | assert maximum_segments(5, 9, 6, 10) == -1 250 | 251 | --- 252 | 253 | ## Problem 254 | 255 | {problem} 256 | 257 | -------- 258 | **Important Instruction:** 259 | For the problem `{problem_name}`: 260 | - First, understand the problem `{problem_name}` and write down your understanding inside the **Problem Understanding** section. 261 | - Then generate five (05) unit test cases that cover: 262 | - **Normal** and **Edge** case scenarios 263 | - **Positive** and **Negative** case scenarios 264 | - **Valid** and **Invalid** case scenarios 265 | inside the **Test Cases** section. 266 | - Write down each test case in a single line following the pattern shown in the example problem. 267 | - Do not generate any code to solve this problem. 268 | """ 269 | 270 | 271 | prompt_for_initial_code_generation = """{problem} 272 | 273 | -------- 274 | Important Instructions: 275 | - Generate {language} code step by step to solve the above-mentioned problem. 276 | - Do not generate any explanation. 277 | - The generated **{language}** code must be enclosed within triple backticks (```). 278 | {std_input_prompt}""" 279 | 280 | 281 | prompt_for_code_validation = """You are a tester tasked with checking a piece of code for a given problem. 282 | 283 | --- 284 | 285 | ## Problem 286 | 287 | {problem} 288 | 289 | ## Code 290 | 291 | {code} 292 | 293 | --- 294 | 295 | **Your output must follow the steps below:** 296 | - Try to generate a test case other than the sample test cases that are mentioned inside the problem. 297 | - Take the input and apply the code step by step to get the output. 298 | - Compare the generated output with the expected output to verify whether the generated code is correct. 299 | - Write **Buggy Code** if you find such a failing test case, otherwise write **Code is ok**. 300 | """ 301 | 302 | 303 | prompt_for_planning = """You are a programmer tasked with generating an appropriate plan to solve a given problem using the **{language}** programming language. 304 | 305 | ## Problem 306 | 307 | {problem} 308 | 309 | **Expected Output:** 310 | 311 | Your response must be structured as follows: 312 | 313 | ### Problem Understanding 314 | 315 | Think about the original problem. Develop an initial understanding of the problem. 
316 | 317 | ### Recall Example Problem 318 | 319 | Recall a relevant and distinct problem (different from the problem mentioned above) and 320 | - describe it 321 | - generate {language} code step by step to solve that problem 322 | - finally generate a plan to solve that problem 323 | 324 | ### Plan 325 | 326 | - Write down a detailed, step-by-step plan to solve the **original problem**. 327 | 328 | -------- 329 | **Important Instruction:** 330 | - Strictly follow the instructions. 331 | - Do not generate code. 332 | """ 333 | 334 | 335 | prompt_for_planning_competative = """You are a programmer tasked with generating an appropriate plan to solve a given problem using the **{language}** programming language. 336 | 337 | ## Problem 338 | 339 | {problem} 340 | 341 | **Expected Output:** 342 | 343 | Your response must be structured as follows: 344 | 345 | ### Problem Understanding 346 | 347 | - Think about the original problem. Develop an initial understanding of the problem. 348 | 349 | ### Recall Example Problem 350 | 351 | Recall a relevant and distinct problem (different from the problem mentioned above) and 352 | - Describe it 353 | - Generate {language} code step by step to solve that problem 354 | - Discuss the algorithm to solve this problem 355 | - Finally, generate a plan to solve that problem 356 | 357 | ### Algorithm to solve the original problem 358 | 359 | - Write down the algorithm that is well suited to the original problem 360 | - Give a short tutorial about the algorithm, for example: 361 | - How to approach this type of algorithm 362 | - Important things to consider 363 | 364 | ### Plan 365 | 366 | - Write down a detailed, step-by-step plan to solve the **original problem**. 367 | 368 | -------- 369 | **Important Instruction:** 370 | - Strictly follow the instructions. 371 | - Do not generate code. 372 | """ 373 | 374 | 375 | prompt_for_simulation = """You are a programmer tasked with verifying a plan to solve a given problem using the **{language}** programming language. 376 | 377 | {problem_with_planning} 378 | 379 | **Expected Output:** 380 | 381 | Your response must be structured as follows: 382 | 383 | ### Simulation 384 | 385 | - Take a sample input and apply the plan step by step to get the output. 386 | - Compare the generated output with the sample output to verify whether your plan works as expected. 387 | 388 | ### Plan Evaluation 389 | 390 | - If the simulation is successful, write **No Need to Modify Plan**. 391 | - Otherwise, write **Plan Modification Needed**. 392 | 393 | """ 394 | 395 | 396 | prompt_for_plan_refinement = """You are a programmer tasked with generating an appropriate plan to solve a given problem using the **{language}** programming language. You already have a wrong plan. Correct it so that it can generate correct code. 397 | 398 | {problem_with_planning} 399 | 400 | ## Plan Critique 401 | 402 | {critique} 403 | 404 | **Expected Output:** 405 | 406 | Your response must be structured as follows: 407 | 408 | ## New Plan 409 | 410 | - Write down a detailed, step-by-step modified plan to solve the **original problem**. 411 | - Ensure each step logically follows from the previous one. 412 | 413 | -------- 414 | **Important Instruction:** 415 | - Your response must contain only the plan. 416 | - Do not add any explanation. 417 | - Do not generate code. 418 | """ 419 | 420 | 421 | 422 | prompt_for_code_generation = """You are a programmer tasked with solving a given problem using the **{language}** programming language. 
Follow the plan and implement code to solve the problem. 423 | 424 | {problem_with_planning} 425 | 426 | -------- 427 | **Important Instructions:** 428 | - Do not add any explanation. 429 | - The generated **{language}** code must be inside a triple backtick (```) code block. 430 | {std_input_prompt}""" 431 | 432 | 433 | prompt_for_debugging = """You are a programmer who has received a solution to a problem written in **{language}** that fails to pass certain test cases. Your task is to modify the code in such a way that it passes all the test cases. Do not generate the same code again. 434 | 435 | {problem_with_planning} 436 | 437 | ### Buggy Code 438 | ```{language} 439 | {code} 440 | ``` 441 | 442 | ### Test Report 443 | 444 | {test_log} 445 | 446 | **Expected Output:** 447 | 448 | Your response must be structured as follows: 449 | 450 | ### Simulation with failed test case 451 | To detect where the bug is: 452 | - Take a sample test case where the code fails. 453 | - Take the input and go through each step according to the plan. 454 | - You will get an output that must be different from the expected output. 455 | 456 | ### Debugging Notes 457 | Based on this simulation, detect which of the following cases applies: 458 | - The plan is wrong. 459 | - The plan is correct but the plan-to-code generation is wrong. 460 | 461 | - Finally, discuss how to correct this code. 462 | 463 | ### Modified Code 464 | 465 | ```{language} 466 | # Your corrected code, with comments explaining each correction. 467 | ``` 468 | 469 | -------- 470 | **Important Instructions:** 471 | - Strictly follow the instructions. 472 | - Do not add testing code, for example assert statements, to your code. 473 | - Do not be overconfident that the generated code is correct. It is wrong. 474 | - The modified **{language}** code must be enclosed within triple backticks (```). 475 | - Your response must contain the **Simulation with failed test case**, **Debugging Notes**, and **Modified Code** sections. 476 | {std_input_prompt}""" 477 | 478 | -------------------------------------------------------------------------------- /src/results/Results.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from utils.jsonl import read_jsonl, write_jsonl, append_in_jsonl 4 | 5 | """ 6 | In this file, we define the Results class, 7 | which is used to store the results of the simulation. 8 | 9 | It takes a result path at construction time and, after each 10 | simulation, saves the results to that path. 11 | 12 | Results are in the form of a list of dictionaries 13 | and will be saved as a jsonl file. 
14 | """ 15 | 16 | 17 | class Results(object): 18 | def __init__( 19 | self, 20 | result_path: str, 21 | discard_previous_run: bool = False 22 | ): 23 | self.result_path = result_path 24 | self.discard_previous_run = discard_previous_run 25 | self.load_results() 26 | 27 | def add_result(self, result: dict): 28 | self.results.append(result) 29 | self.append_results(result) 30 | 31 | def append_results(self, result): 32 | append_in_jsonl(self.result_path, result) 33 | 34 | def save_results(self): 35 | write_jsonl(self.result_path, self.results) 36 | 37 | def load_results(self): 38 | if os.path.exists(self.result_path): 39 | if self.discard_previous_run: 40 | os.remove(self.result_path) 41 | else: 42 | self.results = read_jsonl(self.result_path) 43 | else: 44 | self.results = [] 45 | 46 | def get_results(self): 47 | return self.results 48 | 49 | def __len__(self): 50 | return len(self.results) 51 | 52 | def __getitem__(self, idx): 53 | return self.results[idx] 54 | -------------------------------------------------------------------------------- /src/results/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/results/__init__.py -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kagnlp/CodeGenerator/363454b0e513bdd6a36e6349b5bffb118e500058/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/evaluateET.py: -------------------------------------------------------------------------------- 1 | from utils.jsonl import read_jsonl, write_jsonl 2 | from evaluations.func_evaluate import evaluate_io_et 3 | import os 4 | 5 | 6 | def generate_et_dataset_human( 7 | NORMAL_RESULTS_PATH, 8 | ET_RESULTS_PATH, 9 | ET_DATA_PATH="data/HumanEval/HumanEvalET.jsonl" 10 | ): 11 | dataset = read_jsonl(ET_DATA_PATH) 12 | data_dict = {} 13 | for item in dataset: 14 | data_dict[item["task_id"]] = {"et_item": item} 15 | 16 | results = read_jsonl(NORMAL_RESULTS_PATH) 17 | for result in results: 18 | data_dict[result["task_id"]]["result"] = result 19 | 20 | correct_count = 0 21 | et_results = [] 22 | for key, value in data_dict.items(): 23 | item = value["et_item"] 24 | result = value["result"] 25 | generated_code = result["source_codes"][0] if "source_codes" in result else result["solution"] 26 | 27 | passed = evaluate_io_et( 28 | item['test_case_list'], 29 | generated_code, 30 | prompt=item["prompt"] 31 | ) 32 | 33 | if passed: 34 | result["is_solved"] = True 35 | correct_count += 1 36 | else: 37 | result["is_solved"] = False 38 | 39 | et_results.append(result) 40 | 41 | print(f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}") 42 | # write_jsonl(ET_RESULTS_PATH, et_results) 43 | 44 | et_results = sorted( 45 | et_results, 46 | key=lambda x: int(x["task_id"].split('/')[-1]) 47 | ) 48 | 49 | write_jsonl(ET_RESULTS_PATH, et_results) 50 | print( 51 | f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}") 52 | 53 | 54 | def generate_et_dataset_mbpp( 55 | NORMAL_RESULTS_PATH, 56 | ET_RESULTS_PATH, 57 | ET_DATA_PATH="data/MBPPEval/MBPP_ET.jsonl" 58 | ): 59 | dataset = read_jsonl(ET_DATA_PATH) 60 | data_dict = {} 61 | for item in dataset: 62 | data_dict[item["task_id"]] = {"et_item": item} 63 | 64 | 
results = read_jsonl(NORMAL_RESULTS_PATH) 65 | for result in results: 66 | task_id = int(result["name"].split("_")[1]) 67 | data_dict[task_id]["result"] = result 68 | 69 | correct_count = 0 70 | et_results = [] 71 | for key, value in data_dict.items(): 72 | item = value["et_item"] 73 | result = value.get("result", None) 74 | if result is None: 75 | continue 76 | 77 | generated_code = result["source_codes"][0] if "source_codes" in result else result["solution"] 78 | 79 | passed = evaluate_io_et( 80 | item['test_list'], 81 | generated_code 82 | ) 83 | 84 | if passed: 85 | result["is_solved"] = True 86 | correct_count += 1 87 | else: 88 | result["is_solved"] = False 89 | 90 | et_results.append(result) 91 | print( 92 | f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}") 93 | # write_jsonl(ET_RESULTS_PATH, et_results) 94 | 95 | et_results = sorted( 96 | et_results, 97 | key=lambda x: int(x["name"].split("_")[1]) 98 | ) 99 | 100 | write_jsonl(ET_RESULTS_PATH, et_results) 101 | print( 102 | f"Accuracy: {correct_count}/{len(et_results)} = {correct_count/len(et_results):.2f}") 103 | 104 | 105 | -------------------------------------------------------------------------------- /src/utils/generateEP.py: -------------------------------------------------------------------------------- 1 | from utils.jsonl import read_jsonl, write_jsonl 2 | from evaluations.func_evaluate import evaluate_io_et 3 | import os 4 | 5 | 6 | def generate_ep_dataset_human( 7 | NORMAL_RESULTS_PATH, 8 | EP_SAMPLES_PATH, 9 | ): 10 | samples = [] 11 | results = read_jsonl(NORMAL_RESULTS_PATH) 12 | for result in results: 13 | completion = result["source_codes"][-1] 14 | 15 | if "from typing import *" not in completion: 16 | completion = "from typing import *\n" + completion 17 | 18 | samples.append( 19 | { 20 | "task_id": result["task_id"], 21 | "solution": completion, 22 | # "completion": result["solution"] 23 | } 24 | ) 25 | 26 | write_jsonl(EP_SAMPLES_PATH, samples) 27 | 28 | 29 | mbpp_not_included_set = set([ 30 | "Mbpp/304", "Mbpp/393", "Mbpp/399", "Mbpp/401", "Mbpp/408", 31 | "Mbpp/411", "Mbpp/417", "Mbpp/434", "Mbpp/443", "Mbpp/444", 32 | "Mbpp/452", "Mbpp/464", "Mbpp/584", "Mbpp/617", "Mbpp/625", 33 | "Mbpp/627", "Mbpp/738", "Mbpp/747", "Mbpp/756", "Mbpp/776", 34 | "Mbpp/802", "Mbpp/228", "Mbpp/291" 35 | ]) 36 | 37 | def generate_ep_dataset_mbpp( 38 | NORMAL_RESULTS_PATH, 39 | EP_SAMPLES_PATH, 40 | ): 41 | samples = [] 42 | results = read_jsonl(NORMAL_RESULTS_PATH) 43 | for result in results: 44 | completion = result["source_codes"][-1] 45 | task_id = "Mbpp/" + result["name"].split("_")[1] 46 | if task_id in mbpp_not_included_set: 47 | continue 48 | 49 | if "from typing import *" not in completion: 50 | completion = "from typing import *\n" + completion 51 | 52 | samples.append( 53 | { 54 | "task_id": task_id, 55 | "solution": completion 56 | } 57 | ) 58 | 59 | write_jsonl(EP_SAMPLES_PATH, samples) 60 | 61 | -------------------------------------------------------------------------------- /src/utils/jsonl.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | # Read an jsonl file and convert it into a python list of dictionaries. 
5 | def read_jsonl(filename): 6 | """Reads a jsonl file and returns its lines as a list of dictionaries""" 7 | lines = [] 8 | with open(filename, "r", encoding="utf-8") as file: 9 | for line in file: 10 | lines.append(json.loads(line)) 11 | return lines 12 | 13 | # Write a python list of dictionaries into a jsonl file 14 | def write_jsonl(filename, lines): 15 | """Writes a python list of dictionaries into a jsonl file""" 16 | 17 | if not os.path.exists(os.path.dirname(filename)): 18 | os.makedirs(os.path.dirname(filename)) 19 | 20 | with open(filename, mode="w", encoding="utf-8") as file: 21 | for line in lines: 22 | file.write(json.dumps(line) + "\n") 23 | 24 | 25 | def append_in_jsonl(filename, line): 26 | """Appends a python dictionary to a jsonl file""" 27 | 28 | if not os.path.exists(os.path.dirname(filename)): 29 | os.makedirs(os.path.dirname(filename)) 30 | 31 | with open(filename, mode="a", encoding="utf-8") as file: 32 | file.write(json.dumps(line) + "\n") 33 | 34 | -------------------------------------------------------------------------------- /src/utils/parse.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | """ 4 | Retrieves the code blocks from the response. 5 | """ 6 | def parse_response(response: str) -> str: 7 | 8 | if response is None: 9 | return '' 10 | 11 | if '<think>' in response and '</think>' in response:  # drop any reasoning trace emitted before the final answer 12 | response = response.split('</think>')[1] 13 | 14 | if "```" not in response: 15 | return response 16 | 17 | code_pattern = r'```((.|\n)*?)```' 18 | if "```Python" in response: 19 | code_pattern = r'```Python((.|\n)*?)```' 20 | if "```Python3" in response: 21 | code_pattern = r'```Python3((.|\n)*?)```' 22 | if "```python" in response: 23 | code_pattern = r'```python((.|\n)*?)```' 24 | if "```python3" in response: 25 | code_pattern = r'```python3((.|\n)*?)```' 26 | if "```C" in response: 27 | code_pattern = r'```C((.|\n)*?)```' 28 | if "```c" in response: 29 | code_pattern = r'```c((.|\n)*?)```' 30 | if "```C++" in response: 31 | code_pattern = r'```C\+\+((.|\n)*?)```' 32 | if "```c++" in response: 33 | code_pattern = r'```c\+\+((.|\n)*?)```' 34 | if "```cpp" in response: 35 | code_pattern = r'```cpp((.|\n)*?)```' 36 | if "```Cpp" in response: 37 | code_pattern = r'```Cpp((.|\n)*?)```' 38 | if "```Java" in response: 39 | code_pattern = r'```Java((.|\n)*?)```' 40 | if "```java" in response: 41 | code_pattern = r'```java((.|\n)*?)```' 42 | if "```Node" in response: 43 | code_pattern = r'```Node((.|\n)*?)```' 44 | if "```node" in response: 45 | code_pattern = r'```node((.|\n)*?)```' 46 | if "```Rust" in response: 47 | code_pattern = r'```Rust((.|\n)*?)```' 48 | if "```rust" in response: 49 | code_pattern = r'```rust((.|\n)*?)```' 50 | if "```PHP" in response: 51 | code_pattern = r'```PHP((.|\n)*?)```' 52 | if "```php" in response: 53 | code_pattern = r'```php((.|\n)*?)```' 54 | if "```Go" in response: 55 | code_pattern = r'```Go((.|\n)*?)```' 56 | if "```go" in response: 57 | code_pattern = r'```go((.|\n)*?)```' 58 | if "```Ruby" in response: 59 | code_pattern = r'```Ruby((.|\n)*?)```' 60 | if "```ruby" in response: 61 | code_pattern = r'```ruby((.|\n)*?)```' 62 | if "```C#" in response: 63 | code_pattern = r'```C#((.|\n)*?)```' 64 | if "```c#" in response: 65 | code_pattern = r'```c#((.|\n)*?)```' 66 | if "```csharp" in response: 67 | code_pattern = r'```csharp((.|\n)*?)```' 68 | 69 | code_blocks = re.findall(code_pattern, response, re.DOTALL) 70 | 71 | if type(code_blocks[-1]) == tuple or type(code_blocks[-1]) == list: 72 | code_str = 
"\n".join(code_blocks[-1]) 73 | elif type(code_blocks[-1]) == str: 74 | code_str = code_blocks[-1] 75 | else: 76 | code_str = response 77 | 78 | return code_str.strip() 79 | 80 | 81 | """ 82 | Taking plan in numbered list format and return the list of plans. 83 | Exmaple: 84 | Input: 85 | 1. Plan A 86 | 2. Plan B 87 | 3. Plan C 88 | Output: ["Plan A", "Plan B", "Plan C"] 89 | """ 90 | def extract_plans(planing: str) -> list[str]: 91 | plans = [] 92 | for line in planing.strip().split("\n"): 93 | splits = line.split(". ") 94 | if len(splits) < 2: 95 | continue 96 | if splits[0].isnumeric(): 97 | plans.append(splits[1]) 98 | 99 | return plans -------------------------------------------------------------------------------- /src/utils/runEP.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | windows = False 4 | 5 | def run_eval_plus(RESULTS_PATH, SUMMARY_PATH, DATASET): 6 | 7 | if windows: 8 | command = f"wsl docker run -v /mnt/c/Users/CSE2/Desktop/CodeSIM:/app ganler/evalplus:latest --dataset {DATASET} --samples /app/{RESULTS_PATH} > C:/Users/CSE2/Desktop/CodeSIM/{SUMMARY_PATH}\n" 9 | 10 | with open("temp.bat", mode="w", encoding="utf-8") as file: 11 | file.write(command) 12 | 13 | try: 14 | result = subprocess.run(["temp.bat"], shell=True) 15 | # Print the output and error (if any) 16 | print("Output:\n", result.stdout) 17 | print("Error:\n", result.stderr) 18 | except Exception as e: 19 | print("Error Occured") 20 | print(e) 21 | else: 22 | command = f"docker run -v /home/ashraful/prompting/CodeSIM:/app ganler/evalplus:latest --dataset {DATASET} --samples /app/{RESULTS_PATH} > /home/ashraful/prompting/CodeSIM/{SUMMARY_PATH}\n" 23 | 24 | with open("temp.sh", mode="w", encoding="utf-8") as file: 25 | file.write(command) 26 | 27 | try: 28 | result = subprocess.run([command], shell=True) 29 | # Print the output and error (if any) 30 | print("Output:\n", result.stdout) 31 | print("Error:\n", result.stderr) 32 | except Exception as e: 33 | print("Error Occured") 34 | print(e) 35 | -------------------------------------------------------------------------------- /src/utils/summary.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | from utils.jsonl import read_jsonl, write_jsonl 4 | 5 | 6 | def gen_summary(results_path: str, summary_path: str): 7 | results = pd.DataFrame(read_jsonl(results_path)) 8 | 9 | if "api_calls" not in results: 10 | results["api_calls"] = 1 11 | 12 | solved = len(results.query("is_solved == True")) 13 | unsolved = len(results.query("is_solved == False")) 14 | 15 | accuracy = solved / (solved + unsolved) 16 | 17 | # normal_solved = len(results.query("is_solved == True & api_calls == 2")) 18 | # our_solved = len(results.query("is_solved == True & api_calls > 2")) 19 | 20 | total_prompt_tokens = results['run_details'].apply(lambda x: sum(run['prompt_tokens'] for run in x)).sum() 21 | total_completion_tokens = results['run_details'].apply(lambda x: sum(run['completion_tokens'] for run in x)).sum() 22 | total_taken_time = results['run_details'].apply(lambda x: sum(run['taken_time'] for run in x)).sum() 23 | # total_cost = results['run_details'].apply(lambda x: sum(run['cost'] for run in x)).sum() 24 | 25 | average_prompt_tokens = total_prompt_tokens / len(results) 26 | average_completion_tokens = total_completion_tokens / len(results) 27 | average_taken_time = total_taken_time / len(results) 28 | 29 | total_api_calls = 
results['run_details'].apply(lambda x: sum(run['api_calls'] for run in x)).sum() 30 | max_api_calls = results['run_details'].apply(lambda x: sum(run['api_calls'] for run in x)).max() 31 | min_api_calls = results['run_details'].apply(lambda x: sum(run['api_calls'] for run in x)).min() 32 | average_api_calls = total_api_calls / len(results) 33 | 34 | false_results = results.query("is_solved == False")['run_details'].apply(lambda x: sum(run['api_calls'] for run in x)).value_counts() 35 | true_results = results.query("is_solved == True")['run_details'].apply(lambda x: sum(run['api_calls'] for run in x)).value_counts() 36 | 37 | with open(summary_path, mode="w", encoding="utf-8") as summary_file: 38 | # Define a width for alignment 39 | name_width = 30 40 | value_width = 10 41 | 42 | summary_file.write(f"{'Accuracy:':<{name_width}} {accuracy*100:>{value_width}.01f}\n") 43 | summary_file.write(f"{'Solved:':<{name_width}} {solved:>{value_width}}\n") 44 | summary_file.write(f"{'Unsolved:':<{name_width}} {unsolved:>{value_width}}\n") 45 | # summary_file.write(f"\n") 46 | # summary_file.write(f"{'Normal Solved:':<{name_width}} {normal_solved:>{value_width}}\n") 47 | # summary_file.write(f"{'Our Solved:':<{name_width}} {our_solved:>{value_width}}\n") 48 | summary_file.write(f"\n") 49 | summary_file.write(f"\n") 50 | summary_file.write(f"{'Total Prompt Tokens:':<{name_width}} {total_prompt_tokens:>{value_width}}\n") 51 | summary_file.write(f"{'Average Prompt Tokens:':<{name_width}} {average_prompt_tokens:>{value_width}.0f}\n") 52 | summary_file.write(f"\n") 53 | summary_file.write(f"{'Total Completion Tokens:':<{name_width}} {total_completion_tokens:>{value_width}}\n") 54 | summary_file.write(f"{'Average Completion Tokens:':<{name_width}} {average_completion_tokens:>{value_width}.0f}\n") 55 | summary_file.write(f"\n") 56 | summary_file.write(f"{'Total Taken Time:':<{name_width}} {total_taken_time:>{value_width}.02f}s\n") 57 | summary_file.write(f"{'Average Taken Time:':<{name_width}} {average_taken_time:>{value_width}.02f}s\n") 58 | summary_file.write(f"\n") 59 | # summary_file.write(f"{'Total Cost:':<{name_width}} {total_cost:>{value_width}.02f}\n") 60 | summary_file.write(f"\n") 61 | summary_file.write(f"{'Total Api Calls:':<{name_width}} {total_api_calls:>{value_width}.02f}\n") 62 | summary_file.write(f"{'Max Api Calls:':<{name_width}} {max_api_calls:>{value_width}}\n") 63 | summary_file.write(f"{'Min Api Calls:':<{name_width}} {min_api_calls:>{value_width}}\n") 64 | summary_file.write(f"{'Average Api Calls:':<{name_width}} {average_api_calls:>{value_width}.02}\n") 65 | summary_file.write(f"\n") 66 | summary_file.write(f"\n") 67 | summary_file.write(f"{'Solved Api Calls':<{name_width}}\n") 68 | summary_file.write(f"{'Api calls':<{name_width}} {'Solved':>{value_width}}\n") 69 | # Printing all keys and their values (Solved) 70 | for key, value in true_results.items(): 71 | summary_file.write(f"{key:<{name_width}} {value:>{value_width}}\n") 72 | summary_file.write(f"\n") 73 | summary_file.write(f"{'Unsolved Api Calls':<{name_width}}\n") 74 | summary_file.write(f"{'Api calls':<{name_width}} {'Unsolved':>{value_width}}\n") 75 | # Printing all keys and their values (Unsolved) 76 | for key, value in false_results.items(): 77 | summary_file.write(f"{key:<{name_width}} {value:>{value_width}}\n") 78 | 79 | -------------------------------------------------------------------------------- /src/utils/tokenCount.py: -------------------------------------------------------------------------------- 1 | import 
tiktoken 2 | 3 | 4 | 5 | def token_count(messages, model="gpt-3.5-turbo"): 6 | encoding = tiktoken.encoding_for_model('gpt-3.5-turbo') 7 | # if model == "gpt-3.5-turbo" or model == "gpt" or model == "gpt-35-instant": 8 | # # print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.") 9 | # return token_count(messages, model="gpt-3.5-turbo-0301") 10 | # elif model == "gpt-4": 11 | # # print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.") 12 | # return token_count(messages, model="gpt-4-0314") 13 | 14 | # if "gpt-3.5" in model: 15 | # # every message follows <|start|>{role/name}\n{content}<|end|>\n 16 | # tokens_per_message = 4 17 | # tokens_per_name = -1 # if there's a name, the role is omitted 18 | # elif "gpt-4" in model: 19 | # tokens_per_message = 3 20 | # tokens_per_name = 1 21 | # else: 22 | # raise NotImplementedError( 23 | # f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 24 | 25 | tokens_per_message = 4 26 | tokens_per_name = 1 27 | 28 | num_tokens = 0 29 | for message in messages: 30 | num_tokens += tokens_per_message 31 | for key, value in message.items(): 32 | num_tokens += len(encoding.encode(value)) 33 | if key == "name": 34 | num_tokens += tokens_per_name 35 | num_tokens += 3 # every reply is primed with <|start|>assistant<|message|> 36 | return num_tokens 37 | --------------------------------------------------------------------------------
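The utility modules above (`parse.py`, `jsonl.py`, `tokenCount.py`) are small, independent helpers that the prompting pipeline calls on each model reply (for example, `parse_response` in `CodeSIM.run_single_pass`). As an illustration of how they fit together, here is a minimal usage sketch; it is not part of the repository, it assumes it is run from `src/` so the `utils` package resolves the same way the repo's own modules import it, and the chat messages and output path below are invented purely for this example.

```python
# Minimal sketch (assumption: executed from src/ with the repo's requirements,
# including tiktoken, installed). Messages and the output path are invented.
from utils.jsonl import write_jsonl
from utils.parse import parse_response
from utils.tokenCount import token_count

fence = "`" * 3  # a triple backtick, kept out of the literal so this example renders cleanly
reply = f"{fence}python\ndef reverse_string(s):\n    return s[::-1]\n{fence}"

# token_count expects a list of {"role": ..., "content": ...} chat messages.
messages = [
    {"role": "user", "content": "Write a Python function that reverses a string."},
    {"role": "assistant", "content": reply},
]

code = parse_response(reply)    # extracts the last fenced code block from the reply
tokens = token_count(messages)  # rough size estimate via tiktoken's gpt-3.5-turbo encoding

print(f"{tokens} prompt tokens")
print(code)

# Persist one JSON object per line, the same jsonl layout Results and read_jsonl use.
# "samples/demo.jsonl" is an invented path (samples/ is already listed in .gitignore).
write_jsonl("samples/demo.jsonl", [{"task_id": "demo/0", "solution": code}])
```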