├── apps
│   ├── eval
│   │   ├── __init__.py
│   │   └── apps_metric.py
│   ├── eval.sh
│   ├── make_demonstration.py
│   ├── filter.py
│   ├── utils.py
│   ├── eval.py
│   ├── sc2tmc.py
│   ├── icl.sh
│   ├── data
│   │   ├── 2shot_demonstration_101seed.json
│   │   ├── 2shot_demonstration_27seed.json
│   │   └── 2shot_demonstration_42seed.json
│   └── icl.py
├── README.md
└── codecontests
    ├── icl_corr.sh
    ├── evaluate_corr.sh
    ├── ppl.sh
    ├── evaluate.sh
    ├── preprocess_original_dataset_ft.py
    ├── icl_gpt.sh
    ├── ft.sh
    ├── preprocess_original_dataset_icl.py
    ├── ppl.py
    ├── evaluate_.py
    ├── evaluate_gpt.py
    ├── utils
    │   └── utils_evaluate.py
    ├── calculate_corr.ipynb
    ├── evaluate_ft.py
    ├── evaluate_corr.py
    ├── construct_mc_sc_divided_dataset.ipynb
    ├── icl_ft.sh
    ├── data
    │   ├── monolithic_2shot_demonstration_169seed.jsonl
    │   ├── monolithic_2shot_demonstration_134seed.jsonl
    │   └── monolithic_2shot_demonstration_42seed.jsonl
    ├── construct_demonstration_for_correlation_experiment.ipynb
    ├── calculate_corr_between_mos_and_function_call.ipynb
    ├── sc2mc.py
    ├── icl_corr.py
    ├── icl.py
    └── icl_ft.py

/apps/eval/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Revisiting the Impact of Pursuing Modularity for Code Generation
2 | Official Repository for "Revisiting the Impact of Pursuing Modularity for Code Generation" [[Paper (arXiv)](https://arxiv.org/abs/2407.11406)]
3 | 
4 | Deokyeong Kang, Kijung Seo, Taeuk Kim. _**Accepted to EMNLP 2024 Findings**_
5 | 
6 | 
7 | ## Contents
8 | 
9 | * apps: source code for the APPS dataset
10 |   * In-Context Learning: icl.py
11 | * codecontests: source code for the CodeContests dataset
12 |   * Modularity score (MoS) metric: utils/utils.py
13 |   * In-Context Learning: icl.py
14 |   * Fine-tuning: ft.py
15 |   * Correlation experiment: icl_corr.py
16 |   * Perplexity experiment: ppl.py
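17 | 
18 | ## Modularity score (MoS) at a glance
19 | 
20 | The exact MoS implementation used in the paper lives in `codecontests/utils/utils.py`. Purely as an illustration of the general idea (this is *not* the actual metric, and the helper name and heuristic below are hypothetical), a modularity-style proxy can be computed with the standard `ast` module by measuring how much of a program is factored into functions:
21 | 
22 | ```python
23 | import ast
24 | 
25 | def function_line_fraction(code: str) -> float:
26 |     """Toy modularity proxy: fraction of source lines inside function bodies."""
27 |     tree = ast.parse(code)
28 |     covered = set()
29 |     for node in ast.walk(tree):
30 |         # count every line that belongs to a (sync or async) function definition
31 |         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
32 |             covered.update(range(node.lineno, node.end_lineno + 1))
33 |     return len(covered) / max(len(code.splitlines()), 1)
34 | ```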
--------------------------------------------------------------------------------
/codecontests/icl_corr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # inference for correlation experiment
4 | # CL 7b, pass@1(n=10)
5 | seed=42
6 | size=7
7 | model=meta-llama/CodeLlama-${size}b-hf
8 | # size=6.7
9 | # model=deepseek-ai/deepseek-coder-${size}b-base
10 | num_gpu=4
11 | dtype=float16
12 | num_icl_shot=1
13 | num_gen=10
14 | temperature=0.1
15 | swap_space=8
16 | for metric in var_len; do
17 |     CUDA_VISIBLE_DEVICES=0,1,2,3 python icl_corr.py \
18 |         --seed ${seed} \
19 |         --model ${model} \
20 |         --num_gpu ${num_gpu} \
21 |         --dtype ${dtype} \
22 |         --num_icl_shot ${num_icl_shot} \
23 |         --num_gen ${num_gen} \
24 |         --temperature ${temperature} \
25 |         --max_new_token 1024 \
26 |         --top_p 0.95 \
27 |         --swap_space ${swap_space} \
28 |         > log/inference/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}.log 2>&1 \
29 |         --metric ${metric}
30 |     echo cl${size}b ${metric} inference ends
31 | done
32 | 
--------------------------------------------------------------------------------
/codecontests/evaluate_corr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # CL
4 | size=7
5 | model=meta-llama/CodeLlama-${size}b-hf
6 | num_icl_shot=1
7 | num_gen=10
8 | temperature=0.1
9 | k=1
10 | 
11 | for metric in var_len; do
12 |     python evaluate_corr.py \
13 |         --model ${model} \
14 |         --num_icl_shot ${num_icl_shot} \
15 |         --num_gen ${num_gen} \
16 |         --temperature ${temperature} \
17 |         --metric ${metric} \
18 |         --k ${k} \
19 |         > log/evaluation/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}_corr.log 2>&1
20 |     echo cl${size}b ${metric} score correlation evaluation ends
21 | done
22 | 
23 | 
24 | # # DS
25 | # size=6.7
26 | # model=deepseek-ai/deepseek-coder-${size}b-base
27 | # num_gpu=1
28 | # num_icl_shot=1
29 | # num_gen=10
30 | # temperature=0.1
31 | # k=1
32 | 
33 | # for metric in style modularity; do
34 | #     python evaluate_corr.py \
35 | #         --model ${model} \
36 | #         --num_icl_shot ${num_icl_shot} \
37 | #         --num_gen ${num_gen} \
38 | #         --temperature ${temperature} \
39 | #         --metric ${metric} \
40 | #         --k ${k} \
41 | #         > log/evaluation/ds${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}_corr.log 2>&1
42 | #     echo ds${size}b ${metric} score correlation evaluation ends
43 | # done
--------------------------------------------------------------------------------
/apps/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | temperature=0.1
4 | num_icl_shot=2
5 | code_type_=(mc sc tmc tsc)
6 | model_name=$1
7 | if [ "${model_name}" == deepseek ]; then
8 |     model=deepseek-ai/deepseek-coder-6.7b-base
9 | else
10 |     model=meta-llama/CodeLlama-7b-hf
11 | fi
12 | 
13 | task() {
14 |     local seed=$1
15 |     for code_type in "${code_type_[@]}"; do
16 |         python -u eval.py --seed ${seed} \
17 |             --model ${model} --num_icl_shot ${num_icl_shot} \
18 |             --num_gen 10 --code_type ${code_type} \
19 |             --temperature ${temperature} --modify original \
20 |             > log/evaluation/${model//\//-}_${code_type}_original_${num_icl_shot}shot_10gen_${temperature}temp_${seed}.log 2>&1
21 |     done
22 |     task_completed $seed
23 | }
24 | 
25 | task_completed() {
26 |     local seed=$1
27 |     # Start task for the next seed
28 |     next_seed=$(next_seed $seed)
29 |     if [ -n "$next_seed" ]; then
30 |         task $next_seed &
31 |     fi
32 | }
33 | 
34 | 
35 | next_seed() {
36 |     local seed=$1
37 |     case $seed in
38 |         27) echo 42 ;;
39 |         42) echo 101 ;;
40 |         101) echo 134 ;;
41 |         134) echo 169 ;;
42 |         169) echo "" ;;
43 |     esac
44 | }
45 | 
46 | 
47 | # Start the first task
48 | task 27 &
49 | 
50 | # Wait for all background jobs to finish
51 | wait
--------------------------------------------------------------------------------
/codecontests/ppl.sh:
--------------------------------------------------------------------------------
1 | (nohup python ppl.py --gpu 0 --model meta-llama/CodeLlama-7b-hf --mod low --include_prompt > log/ppl_include_prompt/cl7b_low_mod.log 2>&1) &
2 | (nohup python ppl.py --gpu 1 --model meta-llama/CodeLlama-7b-hf --mod high --include_prompt > log/ppl_include_prompt/cl7b_high_mod.log 2>&1) &
3 | (nohup python ppl.py --gpu 2 --model deepseek-ai/deepseek-coder-6.7b-base --mod low --include_prompt > log/ppl_include_prompt/ds7b_low_mod.log 2>&1) &
4 | (nohup python ppl.py --gpu 3 --model deepseek-ai/deepseek-coder-6.7b-base --mod high --include_prompt > log/ppl_include_prompt/ds7b_high_mod.log 2>&1) &
5 | wait &&
6 | echo 7b model done!
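7 | # The 33B/34B runs below follow the same pattern; they are left running in the
8 | # background (no trailing wait), hence the final "in progress" echo.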
9 | (nohup python ppl.py --gpu 0 --model meta-llama/CodeLlama-34b-hf --mod low --include_prompt > log/ppl_include_prompt/cl34b_low_mod.log 2>&1) &
10 | (nohup python ppl.py --gpu 1 --model meta-llama/CodeLlama-34b-hf --mod high --include_prompt > log/ppl_include_prompt/cl34b_high_mod.log 2>&1) &
11 | (nohup python ppl.py --gpu 2 --model deepseek-ai/deepseek-coder-33b-base --mod low --include_prompt > log/ppl_include_prompt/ds33b_low_mod.log 2>&1) &
12 | (nohup python ppl.py --gpu 3 --model deepseek-ai/deepseek-coder-33b-base --mod high --include_prompt > log/ppl_include_prompt/ds33b_high_mod.log 2>&1) &
13 | echo 33b model in progress!
--------------------------------------------------------------------------------
/apps/make_demonstration.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import torch
4 | import os
5 | from datasets import Dataset
6 | from collections import defaultdict
7 | 
8 | 
9 | def set_seed(seed):
10 |     random.seed(seed)
11 |     np.random.seed(seed)
12 |     torch.manual_seed(seed)
13 |     torch.cuda.manual_seed(seed)
14 |     # When running on the CuDNN backend, two further options must be set
15 |     torch.backends.cudnn.deterministic = True
16 |     torch.backends.cudnn.benchmark = False
17 |     # Set a fixed value for the hash seed
18 |     os.environ["PYTHONHASHSEED"] = str(seed)
19 | 
20 | 
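21 | # Randomly sample two training problems and collect their monolithic (sc) and
22 | # modular (mc) reference solutions to serve as the 2-shot ICL demonstrations.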
" 41 | "Please wrap your code answer using ```:" 42 | ) 43 | 44 | demonstration["problem_id"].append(data["problem_id"]) 45 | demonstration["problem_description"].append(data["question"].strip()) 46 | demonstration["starter_code"].append(data["starter_code"]) 47 | demonstration["sc_instruction"].append(sc_instruction) 48 | demonstration["mc_instruction"].append(mc_instruction) 49 | demonstration["sc"].append(data["sc"][0].strip()) 50 | demonstration["sc_cc"].append(data["sc_cc"][0]) 51 | demonstration["mc"].append(data["mc"][0].strip()) 52 | demonstration["mc_cc"].append(data["mc_cc"][0]) 53 | 54 | return demonstration 55 | 56 | 57 | for seed in [27, 42, 101, 134, 169]: 58 | set_seed(seed) 59 | dataset = Dataset.from_json("data/filtered_APPS.json") 60 | demonstration = extract_demonstration(dataset) 61 | Dataset.from_dict(demonstration).to_json( 62 | f"data/2shot_demonstration_{seed}seed.json" 63 | ) 64 | -------------------------------------------------------------------------------- /codecontests/evaluate.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # # CL 4 | # num_icl_shot=0 5 | # num_gen=1 6 | # temperature=0.1 7 | # k=1 8 | 9 | # for size in 34; do 10 | # for seed in 27 42 101 134 169; do 11 | # python evaluate_.py \ 12 | # --model meta-llama/CodeLlama-${size}b-hf \ 13 | # --seed ${seed} \ 14 | # --num_icl_shot ${num_icl_shot} \ 15 | # --num_gen ${num_gen} \ 16 | # --temperature ${temperature} \ 17 | # --k ${k} \ 18 | # > log/evaluation/2shot_mc/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 19 | # echo cl${size}b ${num_icl_shot} mc ${seed}seed evaluation ends 20 | # done 21 | # done 22 | 23 | # # DS 24 | # num_icl_shot=2 25 | # num_gen=50 26 | # temperature=0.6 27 | # k=10 28 | # for size in 33; do 29 | # for seed in 27 42 101 134 169; do 30 | # python evaluate_.py \ 31 | # --model deepseek-ai-deepseek-coder-${size}b-base \ 32 | # --seed ${seed} \ 33 | # --num_icl_shot ${num_icl_shot} \ 34 | # --num_gen ${num_gen} \ 35 | # --temperature ${temperature} \ 36 | # --k ${k} \ 37 | # > log/evaluation/2shot_mc/ds${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 38 | # echo ds${size}b ${num_icl_shot} mc ${seed}seed evaluation ends 39 | # done 40 | # done 41 | 42 | 43 | # inference after fine-tuning 44 | num_icl_shot=0 45 | num_gen=50 46 | temperature=0.6 47 | k=1 48 | # degree=low 49 | debug_mode=0 50 | chkpt=_final 51 | 52 | for degree in low high; do 53 | for seed in 27; do 54 | python evaluate_ft.py \ 55 | --model meta-llama/CodeLlama-7b-hf \ 56 | --seed ${seed} \ 57 | --num_icl_shot ${num_icl_shot} \ 58 | --num_gen ${num_gen} \ 59 | --temperature ${temperature} \ 60 | --k ${k} \ 61 | --degree ${degree} \ 62 | --chkpt ${chkpt} \ 63 | > log/evaluation/tmp/CodeLlama_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 64 | done 65 | done 66 | 67 | # --model meta-llama/CodeLlama-7b-hf \ 68 | # --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp 69 | 70 | 71 | # # gpt 72 | # num_icl_shot=2 73 | # num_gen=10 74 | # temperature=0.1 75 | # k=1 76 | 77 | # for code_type in monolithic modular transformed_modular transformed_monolithic; do 78 | # for seed in 134; do 79 | # python evaluate_gpt.py \ 80 | # --model gpt-4o-mini \ 81 | # --code_type ${code_type} \ 82 | # --seed ${seed} \ 83 | # --num_icl_shot ${num_icl_shot} \ 84 | # --num_gen ${num_gen} \ 85 | # --temperature ${temperature} \ 86 | # --k ${k} \ 87 | # >> 
log/evaluation/gpt/gpt-4o-mini_${code_type}_code_${num_icl_shot}shot_${temperature}temp_${num_gen}gen.log 2>&1 88 | # done 89 | # done 90 | -------------------------------------------------------------------------------- /apps/filter.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets 2 | import json 3 | from utils import * 4 | from radon.complexity import cc_visit 5 | from eval.apps_metric import apps_metric 6 | import os 7 | 8 | def filtering(dataset): 9 | words = ["codeforces", "atcoder", "codechef"] 10 | dataset = dataset.filter(lambda x: any(word in x["url"] for word in words)) 11 | dataset = make_solution_column(dataset) 12 | 13 | if os.path.exists( 14 | "data/apps_results.json" 15 | ): 16 | results = json.load( 17 | open( 18 | "data/apps_results.json", 19 | "r", 20 | ) 21 | ) 22 | else: 23 | eval_apps = apps_metric() 24 | results, _ = eval_apps._compute( 25 | dataset, k_list=[1], split="train", column_name="solution" 26 | ) 27 | json.dump( 28 | results, 29 | open( 30 | "data/apps_results.json", 31 | "w", 32 | ), 33 | ) 34 | 35 | data = [] 36 | for index in results: 37 | sc = [] 38 | sc_cc = [] 39 | mc = [] 40 | mc_cc = [] 41 | cc_criteria = 10 42 | for i, result in enumerate(results[index]): 43 | try: 44 | code = process_text(dataset[int(index)]["solution"][i]) 45 | code_cc = get_avg_cc(code) 46 | if all(x == True for x in result): 47 | if code_cc >= cc_criteria: 48 | sc.append(code) 49 | sc_cc.append(code_cc) 50 | else: 51 | visit = cc_visit(code) 52 | count = [ 53 | count_module_written(code, func.name) 54 | for func in visit.functions 55 | ] 56 | TF = all(x >= 2 for x in count) 57 | if len(count) >= 3 and TF: 58 | mc.append(code) 59 | mc_cc.append(code_cc) 60 | except: 61 | pass 62 | data.append({"mc": mc, "mc_cc": mc_cc, "sc": sc, "sc_cc": sc_cc}) 63 | 64 | final_data = concatenate_datasets([dataset, Dataset.from_list(data)], axis=1) 65 | final_data = final_data.filter( 66 | lambda x: x["sc"] != [] 67 | and x["mc"] != [] 68 | and -10 not in x["sc_cc"] 69 | and -10 not in x["mc_cc"] 70 | ) 71 | 72 | return final_data 73 | 74 | 75 | def main(): 76 | 77 | dataset_name = "codeparrot/apps" 78 | 79 | dataset = load_dataset( 80 | dataset_name, 81 | trust_remote_code=True, 82 | split="train", 83 | ) 84 | 85 | filtered_dataset = filtering(dataset) 86 | filtered_dataset.to_json( 87 | f"data/filtered_APPS.json" 88 | ) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /codecontests/preprocess_original_dataset_ft.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from datasets import load_dataset 4 | from utils.utils_evaluate import safe_eval_answer_from_agent_ft 5 | from utils.utils import get_code_modularity_score 6 | 7 | 8 | # delete all solutions in another language except python in the dataset 9 | def leave_python_solution(example): 10 | solutions = example['solutions']['solution'] 11 | language_index = example['solutions']['language'] 12 | 13 | python_solution = [] 14 | for i, lang in enumerate(language_index): 15 | if lang == 3: # python3 16 | python_solution.append(solutions[i]) 17 | 18 | example['solutions']['solution'] = python_solution 19 | del example['solutions']['language'] 20 | return example 21 | 22 | 23 | # remove annotated parts in the code 24 | def remove_annotation(example): 25 | def remove_annotation_(input_string): 
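        # Strip "#" line comments, then '''...''' and """...""" blocks.
        # Note: this regex approach also removes legitimate triple-quoted
        # string literals, not only docstrings.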
26 | modified_string = re.sub(r"#.*?(?=\n)", '', input_string) 27 | modified_string = re.sub(r"'''.*?'''", '', modified_string, flags=re.DOTALL) 28 | modified_string = re.sub(r'""".*?"""', '', modified_string, flags=re.DOTALL) 29 | return modified_string 30 | 31 | for i in range(len(example['solutions']['solution'])): 32 | example['solutions']['solution'][i] = remove_annotation_(example['solutions']['solution'][i]) 33 | 34 | return example 35 | 36 | 37 | def calculate_mos(example): 38 | scores = [] 39 | for code in example['solutions']['solution']: 40 | try: 41 | modularity_score = get_code_modularity_score(code.strip()) 42 | except: 43 | modularity_score = -1 44 | 45 | scores.append(modularity_score) 46 | 47 | example['solutions']['modularity'] = scores 48 | 49 | return example 50 | 51 | 52 | def start(split): 53 | base_dir = os.path.dirname(__file__) 54 | 55 | # load original dataset 56 | dataset = load_dataset("deepmind/code_contests") 57 | dataset = dataset[split] 58 | # dataset = dataset[split].select(range(5)) # for test 59 | print(f'len(dataset): {len(dataset)}') 60 | # 1. filter questions without any python solution 61 | print('1') 62 | dataset = dataset.filter(lambda example: 3 in example['solutions']['language']) 63 | # 2. retain only python solutions in problem 64 | print('2') 65 | dataset = dataset.map(leave_python_solution, num_proc=16) 66 | # 3. remove annotation in the code 67 | print('3') 68 | dataset = dataset.map(remove_annotation, num_proc=16) 69 | # 4. retain only python solutions that pass the test cases 70 | print('4') 71 | dataset = dataset.map(safe_eval_answer_from_agent_ft, num_proc=16) 72 | # 5. calculate MoS score of code 73 | print('5') 74 | dataset = dataset.map(calculate_mos, num_proc=16) 75 | # 6. save 76 | dataset.to_json(os.path.join(base_dir, 'data/ft', f'my_code_contests_{split}.jsonl')) 77 | 78 | 79 | # start('test') 80 | start('valid') 81 | # start('train') -------------------------------------------------------------------------------- /apps/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from radon.complexity import cc_visit 4 | 5 | def write_dict_to_jsonl(dictionary, filename): 6 | import json 7 | 8 | with open(filename, "a") as file: 9 | for item in dictionary: 10 | json.dump(item, file) 11 | file.write("\n") 12 | 13 | 14 | def read_jsonl_to_dict(filename): 15 | import json 16 | 17 | result = [] 18 | with open(filename, "r") as file: 19 | for line in file: 20 | item = json.loads(line.strip()) 21 | result.append(item) 22 | return result 23 | 24 | 25 | def count_module_written(code, module): 26 | indices = [] 27 | index = -1 28 | # find all parts starting with module name in the code 29 | while True: 30 | index = code.find(module, index + 1) 31 | if index == -1: 32 | break 33 | indices.append(index) 34 | 35 | # filter 36 | permit_left_char = [ 37 | " ", 38 | "(", 39 | ":", 40 | "+", 41 | "-", 42 | "*", 43 | "/", 44 | "//", 45 | "%", 46 | "=", 47 | "<", 48 | ">", 49 | "!", 50 | "~", 51 | "&", 52 | "|", 53 | "^", 54 | ] 55 | permit_right_char = [" ", "("] 56 | cnt = 0 57 | for index in indices: 58 | if ( 59 | code[index - 1] in permit_left_char 60 | and code[index + len(module)] in permit_right_char 61 | ): 62 | cnt += 1 63 | 64 | return cnt 65 | 66 | 67 | # calculate average cc of code 68 | def get_avg_cc(code): 69 | try: 70 | visitor = cc_visit(code) 71 | 72 | # 1. 
average cc of modules 73 | total_module_complexity = 0 74 | num_module = 0 75 | for module in visitor.blocks: 76 | # only consider function or method of class as module 77 | if module.__class__.__name__ == "Function": 78 | total_module_complexity += module.complexity 79 | num_module += 1 80 | 81 | # 2. cc of body code 82 | body_complexity = visitor.complexity 83 | 84 | # 3. average cc of the program 85 | avg_cc = (total_module_complexity + body_complexity) / (num_module + 1) 86 | except: 87 | # cc_visit fails to return because the input code has some errors 88 | avg_cc = -10 89 | 90 | return avg_cc 91 | 92 | 93 | def process_text(input_string): 94 | modified_string = re.sub(r"#.*?(?=\n)", "", input_string) 95 | modified_string = re.sub(r"'''.*?'''", "", modified_string, flags=re.DOTALL) 96 | modified_string = re.sub(r'""".*?"""', "", modified_string, flags=re.DOTALL) 97 | return modified_string 98 | 99 | 100 | def make_solution_column(dataset): 101 | solution = [] 102 | for problem in dataset: 103 | solution.append(json.loads(problem["solutions"])) 104 | dataset = dataset.add_column("solution", solution) 105 | return dataset -------------------------------------------------------------------------------- /codecontests/icl_gpt.sh: -------------------------------------------------------------------------------- 1 | # lets go 2 | model=gpt-4o-mini 3 | num_icl_shot=2 4 | num_gen=10 5 | temperature=0.1 6 | debug_mode=0 7 | 8 | # for code_type in monolithic; do 9 | # for seed in 27 42 101 134; do 10 | # python icl_gpt.py \ 11 | # --seed ${seed} \ 12 | # --model ${model} \ 13 | # --num_icl_shot ${num_icl_shot} \ 14 | # --num_gen ${num_gen} \ 15 | # --temperature ${temperature} \ 16 | # --max_new_token 1024 \ 17 | # --code_type ${code_type} \ 18 | # --debug_mode ${debug_mode} \ 19 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 20 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 21 | # done 22 | # done 23 | 24 | # for code_type in modular; do 25 | # for seed in 27 42 101 134 169; do 26 | # python icl_gpt.py \ 27 | # --seed ${seed} \ 28 | # --model ${model} \ 29 | # --num_icl_shot ${num_icl_shot} \ 30 | # --num_gen ${num_gen} \ 31 | # --temperature ${temperature} \ 32 | # --max_new_token 1024 \ 33 | # --code_type ${code_type} \ 34 | # --debug_mode ${debug_mode} \ 35 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 36 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 37 | # done 38 | # done 39 | 40 | # for code_type in transformed_monolithic; do 41 | # for seed in 27 42 101 134 169; do 42 | # python icl_gpt.py \ 43 | # --seed ${seed} \ 44 | # --model ${model} \ 45 | # --num_icl_shot ${num_icl_shot} \ 46 | # --num_gen ${num_gen} \ 47 | # --temperature ${temperature} \ 48 | # --max_new_token 1024 \ 49 | # --code_type ${code_type} \ 50 | # --debug_mode ${debug_mode} \ 51 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 52 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 53 | # done 54 | # done 55 | 56 | for code_type in transformed_modular; do 57 | for seed in 27 42 134 169; do 58 | python icl_gpt.py \ 59 | --seed ${seed} \ 60 | --model ${model} \ 61 | --num_icl_shot ${num_icl_shot} \ 62 | --num_gen ${num_gen} \ 63 | --temperature ${temperature} \ 64 | --max_new_token 1024 \ 65 | --code_type ${code_type} \ 66 | 
--debug_mode ${debug_mode} \ 67 | > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 68 | echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 69 | done 70 | done 71 | 72 | 73 | # # # for test 74 | # model=gpt-4o-mini 75 | # num_icl_shot=2 76 | # num_gen=10 77 | # temperature=0.1 78 | # debug_mode=0 79 | 80 | # for code_type in transformed_modular; do 81 | # for seed in 101; do 82 | # python icl_gpt.py \ 83 | # --seed ${seed} \ 84 | # --model ${model} \ 85 | # --num_icl_shot ${num_icl_shot} \ 86 | # --num_gen ${num_gen} \ 87 | # --temperature ${temperature} \ 88 | # --max_new_token 1024 \ 89 | # --code_type ${code_type} \ 90 | # --debug_mode ${debug_mode} \ 91 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 92 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 93 | # done 94 | # done -------------------------------------------------------------------------------- /apps/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from eval.apps_metric import apps_metric 4 | from eval.utils import get_results 5 | import argparse 6 | from datasets import Dataset 7 | import re 8 | import os 9 | import argparse 10 | 11 | from tqdm import tqdm 12 | 13 | from utils import read_jsonl_to_dict, write_dict_to_jsonl 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--seed", type=int, default=42) 19 | parser.add_argument("--model", type=str, default="meta-llama/CodeLlama-7b-hf") 20 | parser.add_argument("--num_icl_shot", type=int, default=2) 21 | parser.add_argument( 22 | "--num_gen", 23 | type=int, 24 | default=10, 25 | help="number of solutions generated per problem", 26 | ) 27 | parser.add_argument("--code_type", type=str, default="sc") 28 | parser.add_argument( 29 | "--temperature", 30 | type=float, 31 | default=0.1, 32 | help="0 means greedy decoding for vllm", 33 | ) 34 | parser.add_argument("--k", type=int, default=1, help="k of pass@k") 35 | parser.add_argument( 36 | "--modify", 37 | type=str, 38 | default="original", 39 | help="modification method of the demonstration code", 40 | ) 41 | parser.add_argument( 42 | "--level", type=str, default="all", help="level of the evaluation" 43 | ) 44 | 45 | args = parser.parse_args() 46 | 47 | base_directory = os.path.dirname(__file__) 48 | file_name = f"{args.model.replace('/', '-')}_{args.code_type}_{args.modify}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 49 | 50 | data = read_jsonl_to_dict(os.path.join(base_directory, "result", file_name)) 51 | data = Dataset.from_list(data) 52 | 53 | if not os.path.exists( 54 | os.path.join(base_directory, "tf", file_name.replace("result.jsonl", "tf.json")) 55 | ): 56 | eval_apps = apps_metric() 57 | results, metrics = eval_apps._compute( 58 | data, 59 | k_list=[1, 5], 60 | level=args.level, 61 | split="test", 62 | column_name="extracted_solutions", 63 | ) 64 | json.dump( 65 | results, 66 | open( 67 | os.path.join( 68 | base_directory, "tf", file_name.replace("result.jsonl", "tf.json") 69 | ), 70 | "w", 71 | ), 72 | ) 73 | else: 74 | results = json.load(open(os.path.join(base_directory, "tf", file_name.replace("result.jsonl", "tf.json")),"r")) 75 | print("\n\n\nResults: pass@k on all level") 76 | get_results( 77 | data, 78 | k_list=[1, 5], 79 | ) 80 | 81 | 82 | results_list = 
[results[index] for index in results] 83 | passed_list = [] 84 | for results in results_list: 85 | for result in results: 86 | passed = [] 87 | for element in result: 88 | passed.append([int(element)]) 89 | passed_list.append(passed) 90 | data = data.add_column("passed", passed_list) 91 | for difficulty in ["introductory", "interview", "competition"]: 92 | print(f"\n\n\nResults: pass@k on {difficulty} level") 93 | get_results( 94 | data.filter(lambda x: x["difficulty"] == difficulty)["passed"], 95 | k_list=[1, 5], 96 | ) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /codecontests/ft.sh: -------------------------------------------------------------------------------- 1 | for degree in high; do 2 | python my_run_clm.py \ 3 | --model_name_or_path deepseek-ai/deepseek-coder-6.7b-base \ 4 | --train_file data/ft_final/my_code_contests_train_${degree}.jsonl \ 5 | --validation_file data/ft_final/my_code_contests_valid_${degree}.jsonl \ 6 | --output_dir tmp/deepseek/${degree} \ 7 | --save_steps 100 \ 8 | --logging_steps 30 \ 9 | --evaluation_strategy steps \ 10 | --max_eval_samples 50 \ 11 | --torch_dtype bfloat16 \ 12 | --block_size 2048 \ 13 | --preprocessing_num_workers 8 \ 14 | --trust_remote_code 1 \ 15 | --do_train \ 16 | --do_eval \ 17 | --learning_rate 5e-5 \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 16 \ 22 | --lr_scheduler_type cosine \ 23 | --warmup_ratio 0.01 \ 24 | --low_cpu_mem_usage True \ 25 | --overwrite_output_dir True \ 26 | --report_to wandb \ 27 | --run_name deepseekcoder-7b-${degree}-mod \ 28 | --resume_from_checkpoint /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/deepseek/high/checkpoint-200 29 | done 30 | 31 | # --max_train_samples 50 \ 32 | 33 | 34 | # # start from checkpoint 35 | # degree=low 36 | # python my_run_clm.py \ 37 | # --model_name_or_path meta-llama/CodeLlama-7b-hf \ 38 | # --train_file data/ft_final/my_code_contests_train_${degree}.jsonl \ 39 | # --validation_file data/ft_final/my_code_contests_valid_${degree}.jsonl \ 40 | # --output_dir tmp/CodeLlama \ 41 | # --save_steps 5 \ 42 | # --evaluation_strategy steps \ 43 | # --max_train_samples 10 \ 44 | # --torch_dtype bfloat16 \ 45 | # --block_size 2048 \ 46 | # --preprocessing_num_workers 8 \ 47 | # --trust_remote_code 1 \ 48 | # --do_train \ 49 | # --do_eval \ 50 | # --learning_rate 5e-5 \ 51 | # --num_train_epochs 1 \ 52 | # --per_device_train_batch_size 1 \ 53 | # --per_device_eval_batch_size 1 \ 54 | # --gradient_accumulation_steps 1 \ 55 | # --lr_scheduler_type cosine \ 56 | # --warmup_ratio 0.01 \ 57 | # --low_cpu_mem_usage True \ 58 | # --overwrite_output_dir True \ 59 | # --report_to wandb \ 60 | # --run_name codellama-7b-${degree}-mod \ 61 | # --resume_from_checkpoint /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/CodeLlama/checkpoint-5 \ 62 | 63 | 64 | 65 | # --max_train_samples 50 \ 66 | 67 | 68 | # python my_run_clm.py \ 69 | # --model_name_or_path meta-llama/CodeLlama-7b-hf \ 70 | # --train_file data/ft_final/my_code_contests_train_low.jsonl \ 71 | # --validation_file data/ft_final/my_code_contests_valid_low.jsonl \ 72 | # --output_dir tmp/CodeLlama \ 73 | # --save_steps 60 \ 74 | # --evaluation_strategy steps \ 75 | # --max_train_samples 50 \ 76 | # --max_eval_samples 1 \ 77 | # --torch_dtype bfloat16 \ 78 | # --block_size 2048 \ 79 | # --preprocessing_num_workers 8 \ 80 | # --trust_remote_code 1 
\
81 | #     --do_train \
82 | #     --do_eval \
83 | #     --learning_rate 5e-5 \
84 | #     --num_train_epochs 2 \
85 | #     --per_device_train_batch_size 1 \
86 | #     --per_device_eval_batch_size 1 \
87 | #     --gradient_accumulation_steps 1 \
88 | #     --lr_scheduler_type cosine \
89 | #     --warmup_ratio 0.01 \
90 | #     --low_cpu_mem_usage True \
91 | #     --overwrite_output_dir True \
92 | #     --report_to wandb \
93 | #     --run_name codellama-7b-low-mod \
94 | 
95 | 
96 | 
97 | 
98 | 
99 | # # --max_eval_samples 50 \
100 | # # --logging_steps 20 \
101 | 
102 | 
--------------------------------------------------------------------------------
/apps/eval/apps_metric.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Evaluation of code generation on the APPS benchmark"""
15 | 
16 | import evaluate
17 | import datasets
18 | from .utils import compute_metrics
19 | from .testing_util import run_test
20 | 
21 | 
22 | _CITATION = """\
23 | @article{hendrycksapps2021,
24 |     title={Measuring Coding Challenge Competence With APPS},
25 |     author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
26 |     journal={NeurIPS},
27 |     year={2021}
28 | }
29 | """
30 | 
31 | 
32 | _DESCRIPTION = """\
33 | This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
34 | APPS" (https://arxiv.org/pdf/2105.09938.pdf).
35 | """
36 | 
37 | 
38 | # TODO: Add description of the arguments of the module here
39 | _KWARGS_DESCRIPTION = """
40 | Computes average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
41 | Args:
42 |     predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from the APPS dataset.
43 | 
44 | Returns:
45 |     metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
46 | Examples:
47 |     >>> my_new_module = evaluate.load("loubnabnl/apps_metric")
48 |     >>> results = my_new_module.compute(predictions=[["s=input()\nprint(s)"]])
49 |     >>> print(results)
50 |     {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
51 | """
52 | 
53 | 
54 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
55 | class apps_metric(evaluate.EvaluationModule):
56 |     """Evaluate code generation on APPS benchmark.
57 | The generations are compiled and their corresponding unit tests are run""" 58 | 59 | def _info(self): 60 | return evaluate.EvaluationModuleInfo( 61 | module_type="metric", 62 | description=_DESCRIPTION, 63 | citation=_CITATION, 64 | inputs_description=_KWARGS_DESCRIPTION, 65 | features=datasets.Features( 66 | { 67 | "predictions": datasets.Sequence(datasets.Value("string")), 68 | } 69 | ), 70 | homepage="https://github.com/hendrycks/apps", 71 | reference_urls=["https://huggingface.co/datasets/codeparrot/apps"], 72 | ) 73 | 74 | def _compute( 75 | self, 76 | data, 77 | k_list=[1, 10, 100], 78 | count_errors=True, 79 | level="all", 80 | debug=False, 81 | split="test", 82 | column_name="extracted_solutions", 83 | ): 84 | """Returns the scores""" 85 | results, metrics = compute_metrics( 86 | data, 87 | k_list=k_list, 88 | count_errors=count_errors, 89 | level=level, 90 | debug=debug, 91 | split=split, 92 | column_name=column_name, 93 | ) 94 | return results, metrics 95 | -------------------------------------------------------------------------------- /apps/sc2tmc.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from datasets import Dataset 3 | from utils import * 4 | from eval.apps_metric import apps_metric 5 | 6 | from filter import * 7 | 8 | 9 | def mc_transform(question, sc): 10 | client = OpenAI(api_key=your_key) 11 | try: 12 | messages = [ 13 | { 14 | "role": "system", 15 | "content": "You are an AI programming assistant.", 16 | }, 17 | { 18 | "role": "user", 19 | "content": f"""QUESTION: 20 | {question} 21 | 22 | ANSWER: 23 | ```python 24 | {sc} 25 | ``` 26 | Refactor the above program. Follow the guidelines 27 | * make the program more modular with smaller and meaningful helper functions 28 | * good descriptive names for the helper functions 29 | * have an entry function called 'main()' 30 | * 'main()' is called inside 'if __name__ == '__main__'' 31 | 32 | Do not change the original semantics of the program significantly and no need to perform optimizations. 
Enclose the program within backticks as shown above.""", 33 | }, 34 | ] 35 | 36 | completion = client.chat.completions.create( 37 | model="gpt-3.5-turbo", 38 | messages=messages, 39 | max_tokens=1024, 40 | stop=["\n\n\n\n", "QUESTION:", "ANSWER:"], 41 | temperature=0.6, 42 | n=20, 43 | ) 44 | 45 | response = [] 46 | for choice in completion.choices: 47 | content = choice.message.content 48 | response.append(extract_solution(content)) 49 | return response 50 | 51 | except: 52 | return None 53 | 54 | 55 | def extract_solution(code): 56 | start_index = code.find("```python") 57 | if start_index == -1: 58 | solution = code 59 | else: 60 | end_index = code.find("```", start_index + len("```python")) 61 | if start_index < end_index: 62 | solution = code[start_index + len("```python") : end_index] 63 | else: 64 | solution = code[start_index + len("```python") :] 65 | return solution 66 | 67 | 68 | def main(): 69 | eval_apps = apps_metric() 70 | for seed in [27, 42, 101, 134, 169]: 71 | data = Dataset.from_json(f"data/2shot_demonstration_{seed}seed.json") 72 | dataset = data.map( 73 | lambda x: {"tmc": mc_transform(x["problem_description"], x["sc"])} 74 | ) 75 | results, _ = eval_apps._compute( 76 | dataset, k_list=[1], split="train", column_name="tmc" 77 | ) 78 | transformed_mc = [] 79 | for index in results: 80 | passed_code = [] 81 | for i, result in enumerate(results[index]): 82 | code = dataset["tmc"][int(index)][i] 83 | print(code) 84 | if all(x == True for x in result): 85 | visit = cc_visit(code) 86 | count = [ 87 | count_module_written(code, func.name) 88 | for func in visit.functions 89 | ] 90 | TF = all(x >= 2 for x in count) 91 | if len(count) >= 3 and TF: 92 | passed_code.append([code]) 93 | break 94 | if not len(passed_code) > 0: 95 | # raise ValueError("No code passed the criteria") 96 | break 97 | else: 98 | transformed_mc.append(passed_code[0]) 99 | if len(transformed_mc) == len(dataset): 100 | dataset = dataset.remove_columns(["tmc"]) 101 | dataset = dataset.add_column("transformed_mc", transformed_mc) 102 | dataset.to_json( 103 | f"data/2shot_demonstration_{seed}seed.json" 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /codecontests/preprocess_original_dataset_icl.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from datasets import load_dataset 4 | from utils.utils_evaluate import safe_eval_answer_from_agent 5 | from radon.complexity import cc_visit 6 | 7 | 8 | # delete all solutions in another language except python in the dataset 9 | def leave_python_solution(example): 10 | solutions = example['solutions']['solution'] 11 | language_index = example['solutions']['language'] 12 | 13 | python_solution = [] 14 | for i, lang in enumerate(language_index): 15 | if lang == 3: # python3 16 | python_solution.append(solutions[i]) 17 | 18 | example['solutions']['solution'] = python_solution 19 | del example['solutions']['language'] 20 | return example 21 | 22 | 23 | # remove annotated parts in the code 24 | def remove_annotation(example): 25 | def remove_annotation_(input_string): 26 | modified_string = re.sub(r"#.*?(?=\n)", '', input_string) 27 | modified_string = re.sub(r"'''.*?'''", '', modified_string, flags=re.DOTALL) 28 | modified_string = re.sub(r'""".*?"""', '', modified_string, flags=re.DOTALL) 29 | return modified_string 30 | 31 | for i in range(len(example['solutions']['solution'])): 32 | 
example['solutions']['solution'][i] = remove_annotation_(example['solutions']['solution'][i]) 33 | 34 | return example 35 | 36 | 37 | # calculate cc and module list of code and add them to dataset 38 | def add_cc_and_modules(example): 39 | ccs = [] 40 | modules = [] 41 | 42 | for code in example['solutions']['solution']: 43 | cc, module_name = get_avg_cc_and_module(code) 44 | ccs.append(cc) 45 | modules.append(module_name) 46 | 47 | example['solutions']['cc'] = ccs 48 | example['solutions']['modules'] = modules 49 | 50 | return example 51 | 52 | 53 | # calculate average cc of each solution code and add it 54 | def get_avg_cc_and_module(code): 55 | try: 56 | module_name = [] 57 | visitor = cc_visit(code) 58 | 59 | # 1. average cc of modules 60 | total_module_complexity = 0 61 | num_module = 0 62 | for module in visitor.blocks: 63 | # only consider function or method of class as module 64 | if module.__class__.__name__ == 'Function': 65 | module_name.append(module.name) 66 | total_module_complexity += module.complexity 67 | num_module += 1 68 | 69 | # 2. cc of body code 70 | body_complexity = visitor.complexity 71 | 72 | # 3. average cc of the program 73 | avg_cc = (total_module_complexity + body_complexity) / (num_module + 1) 74 | except: 75 | # cc_visit fails to return because the input code has some errors 76 | avg_cc = 0 77 | module_name = [] 78 | 79 | return avg_cc, module_name 80 | 81 | 82 | def start(split): 83 | base_dir = os.path.dirname(__file__) 84 | 85 | # load original dataset 86 | dataset = load_dataset("deepmind/code_contests", cache_dir='/data/huggingface/datasets') 87 | dataset = dataset[split] 88 | # 1. filter questions without any python solution 89 | dataset = dataset.filter(lambda example: 3 in example['solutions']['language']) 90 | # 2. retain only python solutions in problem 91 | dataset = dataset.map(leave_python_solution, num_proc=2) 92 | # 3: mark each python solution passed or not by running the test cases 93 | dataset = dataset.map(safe_eval_answer_from_agent, num_proc=1) 94 | # 4. remove annotation parts of code 95 | dataset = dataset.map(remove_annotation) 96 | # 5. add cc and modules names contained in the code to the dataset 97 | dataset = dataset.map(add_cc_and_modules, num_proc=16) 98 | # 6. 
save 99 | dataset.to_json(os.path.join(base_dir, 'data', f'my_code_contests_{split}.jsonl')) 100 | 101 | 102 | start('test') 103 | start('valid') 104 | start('train') -------------------------------------------------------------------------------- /codecontests/ppl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForCausalLM 3 | from utils.utils import read_jsonl_to_dict 4 | from tqdm import tqdm 5 | import argparse 6 | import random 7 | import numpy as np 8 | import os 9 | 10 | 11 | def set_seed(seed): 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed(seed) 16 | # When running on the CuDNN backend, two further options must be set 17 | torch.backends.cudnn.deterministic = True 18 | torch.backends.cudnn.benchmark = False 19 | # Set a fixed value for the hash seed 20 | os.environ["PYTHONHASHSEED"] = str(seed) 21 | 22 | 23 | set_seed(42) 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--gpu", type=int, required=True, default=0) 27 | parser.add_argument("--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf") 28 | parser.add_argument("--include_prompt", action='store_true') 29 | parser.add_argument("--mod", type=str, required=True) 30 | args = parser.parse_args() 31 | 32 | device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu") 33 | tokenizer = AutoTokenizer.from_pretrained(args.model) 34 | 35 | if 'CodeLlama' in args.model: 36 | dtype = torch.float16 37 | elif 'deepseek' in args.model: 38 | dtype = torch.bfloat16 39 | 40 | model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=dtype) 41 | model = model.to(device) 42 | tokenizer.pad_token = tokenizer.eos_token 43 | 44 | path = f'/home/kdy20401/Workspace/Proj-Code-Generation/MC/data/my_code_contests_train_{args.mod}_mod.jsonl' 45 | dataset = read_jsonl_to_dict(path) 46 | 47 | losses = [] 48 | perplexity = [] 49 | # length = [] 50 | problems = [] 51 | for j, data in enumerate(dataset): 52 | description = data['description'] 53 | code = data['code'] 54 | 55 | if args.include_prompt == True: 56 | instruction = ( 57 | "Write a python code to solve the following coding problem " 58 | "that obeys the constraints and passes the example test cases. " 59 | "The output code needs to read from and write to standard IO. 
" 60 | "Please wrap your code answer using ```:" 61 | ) 62 | if 'CodeLlama' in args.model: 63 | prefix = "" 64 | prefix += "Q: " + instruction + "\n" 65 | prefix += description + "\n" 66 | prefix += "A: " 67 | elif 'deepseek' in args.model: 68 | prefix = "" 69 | prefix += instruction + '\n' 70 | prefix += "### Instruction:\n" + description + "\n" 71 | prefix += "### Response:\n" 72 | 73 | prompt = prefix + code 74 | all_tokens = tokenizer(prompt, return_tensors="pt", max_length=8192, truncation=True).to(device) 75 | prefix_tokens = tokenizer(prefix, return_tensors="pt", max_length=8192, truncation=True).to(device) 76 | code_start_index = len(prefix_tokens['input_ids'][0]) 77 | labels = all_tokens['input_ids'].clone() 78 | labels[:, :code_start_index] = -100 # ignore loss of prefix 79 | else: 80 | prompt = code 81 | all_tokens = tokenizer(prompt, return_tensors="pt", max_length=8192, truncation=True).to(device) 82 | labels = all_tokens['input_ids'] 83 | 84 | # problem 85 | problems.append(data['name']) 86 | with torch.no_grad(): 87 | outputs = model(all_tokens['input_ids'], labels=labels) 88 | loss = outputs.loss 89 | # loss 90 | losses.append(loss) 91 | 92 | ppl = torch.exp(outputs.loss).item() 93 | if ppl != torch.nan: 94 | perplexity.append(ppl) 95 | else: 96 | print('nan!') 97 | 98 | 99 | # print(min(length), max(length)) 100 | print(f'model: {args.model}') 101 | print(f'dataset of {args.mod} modularity') 102 | # print(f'average nll: {torch.stack(losses).mean()}') 103 | print(f'average ppl: {sum(perplexity) / len(perplexity)}') 104 | -------------------------------------------------------------------------------- /codecontests/evaluate_.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | default=0, 42 | required=True, 43 | help="0 means greedy decoding for vllm", 44 | ) 45 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 46 | 47 | args = parser.parse_args() 48 | 49 | base_directory = os.path.dirname(__file__) 50 | test_dataset = load_dataset( 51 | "deepmind/code_contests", split="test", 52 | ) 53 | 54 | result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 55 | 56 | if not os.path.exists(os.path.join(base_directory, "result", result_file)): 57 | return 58 | 59 | if os.path.exists(os.path.join(base_directory, "result", "2shot_mc", result_file)): 60 | return 61 | 62 | result_data = 
read_jsonl_to_dict(os.path.join(base_directory, "result", result_file)) 63 | assert len(result_data) == 165 64 | 65 | start = time.time() 66 | passed_results = [] 67 | for i, data in enumerate(result_data): 68 | # make test cases for each problem 69 | tests = {"inputs": [], "outputs": []} 70 | tests["inputs"].extend(data["public_tests"]["input"]) 71 | tests["inputs"].extend(data["private_tests"]["input"]) 72 | tests["outputs"].extend(data["public_tests"]["output"]) 73 | tests["outputs"].extend(data["private_tests"]["output"]) 74 | assert len(tests["inputs"]) == len(tests["outputs"]) 75 | 76 | time_limit = test_dataset[i]["time_limit"]["seconds"] 77 | passed = [] 78 | for code in data["extracted_solutions"]: 79 | manager = multiprocessing.Manager() 80 | manager_list = manager.list() 81 | p = multiprocessing.Process( 82 | target=_temp_run, args=(code, tests, manager_list) 83 | ) 84 | p.start() 85 | p.join(timeout=time_limit + 1) 86 | 87 | if p.is_alive(): 88 | p.kill() 89 | if not manager_list: 90 | passed.append(0) 91 | else: 92 | if manager_list[0] == True: 93 | passed.append(1) 94 | else: 95 | passed.append(0) 96 | 97 | result_data[i]["passed"] = passed # new data 98 | passed_results.append(passed) 99 | 100 | print(f"time: {time.time() - start:.2f}s") 101 | ks = [args.k] 102 | performance = compute_pass_at_ks(passed_results, ks) 103 | print(f"pass@{ks[0]}: {performance}") 104 | # statistics for one dot in the correlation figure 105 | # add pass information to result_data and save 106 | write_dict_to_jsonl(result_data, os.path.join(base_directory, "result", "2shot_mc", result_file)) 107 | print(f'{result_file} saved.') 108 | 109 | print('program ends.') 110 | 111 | 112 | 113 | 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /codecontests/evaluate_gpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | default=0, 42 | required=True, 43 | help="0 means greedy decoding for vllm", 44 | ) 45 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 46 | parser.add_argument("--code_type", type=str, required=True) 47 | 48 | args = parser.parse_args() 49 | 50 | base_directory = os.path.dirname(__file__) 51 | test_dataset = load_dataset( 52 | "deepmind/code_contests", split="test", 53 | ) 54 | 55 | result_file = 
f"{args.model}_{args.code_type}_code_{args.num_icl_shot}shot_{args.num_gen}gen_{args.seed}seed_icl_result.jsonl" 56 | 57 | if not os.path.exists(os.path.join(base_directory, "result/gpt", result_file)): 58 | print('result file does not exist') 59 | return 60 | 61 | if os.path.exists(os.path.join(base_directory, "result/gpt/result", result_file)): 62 | print('result file already exists') 63 | return 64 | 65 | result_data = read_jsonl_to_dict(os.path.join(base_directory, "result/gpt", result_file)) 66 | print(f'result file path:') 67 | print(os.path.join(base_directory, "result/gpt/", result_file)) 68 | 69 | start = time.time() 70 | passed_results = [] 71 | for i, data in enumerate(result_data): 72 | # make test cases for each problem 73 | tests = {"inputs": [], "outputs": []} 74 | tests["inputs"].extend(data["public_tests"]["input"]) 75 | tests["inputs"].extend(data["private_tests"]["input"]) 76 | tests["outputs"].extend(data["public_tests"]["output"]) 77 | tests["outputs"].extend(data["private_tests"]["output"]) 78 | assert len(tests["inputs"]) == len(tests["outputs"]) 79 | 80 | time_limit = test_dataset[i]["time_limit"]["seconds"] 81 | passed = [] 82 | for code in data["extracted_solutions"]: 83 | manager = multiprocessing.Manager() 84 | manager_list = manager.list() 85 | p = multiprocessing.Process( 86 | target=_temp_run, args=(code, tests, manager_list) 87 | ) 88 | p.start() 89 | p.join(timeout=time_limit + 1) 90 | 91 | if p.is_alive(): 92 | p.kill() 93 | if not manager_list: 94 | passed.append(0) 95 | else: 96 | if manager_list[0] == True: 97 | passed.append(1) 98 | else: 99 | passed.append(0) 100 | 101 | result_data[i]["passed"] = passed # new data 102 | passed_results.append(passed) 103 | 104 | # print(f"time: {time.time() - start:.2f}s") 105 | ks = [args.k] 106 | performance = compute_pass_at_ks(passed_results, ks) 107 | print(f"pass@{ks[0]}: {performance}") 108 | # statistics for one dot in the correlation figure 109 | # add pass information to result_data and save 110 | write_dict_to_jsonl(result_data, os.path.join(base_directory, "result/gpt/result", result_file)) 111 | # print(f'{result_file} saved.') 112 | 113 | # print('program ends.') 114 | 115 | 116 | 117 | 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /codecontests/utils/utils_evaluate.py: -------------------------------------------------------------------------------- 1 | # import gzip 2 | import io 3 | import itertools 4 | import json 5 | # import pprint 6 | import numpy as np 7 | # import re 8 | # import sys 9 | # import timeout_decorator 10 | # import traceback 11 | # from collections import Counter 12 | # from io import StringIO 13 | import sys 14 | # from collections import defaultdict 15 | # from datasets import concatenate_datasets, load_dataset 16 | # from multiprocessing import Process, Queue 17 | import multiprocessing 18 | # from tqdm import tqdm 19 | from typing import Dict, List, Union 20 | # import os 21 | # import ast 22 | # import random 23 | # import subprocess 24 | # import tempfile, shutil, os 25 | # from pyext import RuntimeModule 26 | from copy import deepcopy, copy 27 | # from functools import wraps 28 | import time 29 | import contextlib 30 | import pdb 31 | 32 | from utils.utils_execute import run_test 33 | 34 | GLOBAL_TIMEOUT = 10 # TIMEOUT for one solution 35 | 36 | 37 | def safe_eval_answer_from_agent(example): 38 | def _temp_run(code, tests, result): 39 | try: 40 | flag, outcomes = 
verify_code_official(tests, code) 41 | result.append(flag) 42 | except Exception as e: 43 | pass 44 | 45 | tests = {'inputs': [], 'outputs': []} 46 | tests['inputs'].extend(example['public_tests']['input']) 47 | tests['inputs'].extend(example['private_tests']['input']) 48 | tests['outputs'].extend(example['public_tests']['output']) 49 | tests['outputs'].extend(example['private_tests']['output']) 50 | passed = [] 51 | 52 | for code in example['solutions']['solution']: 53 | manager = multiprocessing.Manager() 54 | result = manager.list() 55 | p = multiprocessing.Process(target=_temp_run, args=(code, tests, result)) 56 | p.start() 57 | p.join(timeout=GLOBAL_TIMEOUT + 1) 58 | if p.is_alive(): 59 | p.kill() 60 | if not result: 61 | result = [-1] 62 | 63 | if result[0] == True: 64 | passed.append(True) 65 | else: 66 | passed.append(False) 67 | 68 | example['solutions']['passed'] = passed 69 | return example 70 | 71 | def verify_code_official(tests, solution, debug=False, return_output=False): 72 | ''' verify if code passes all tests, using apps official implementation (https://github.com/hendrycks/apps/blob/main/eval/testing_util.py#L122) 73 | ''' 74 | tests = deepcopy(tests) 75 | # suppress the stdout of solution execution 76 | # todo: suppress stderr as well 77 | with contextlib.redirect_stdout(io.StringIO()): 78 | results = run_test(tests, solution, debug=debug, return_output=return_output) 79 | if return_output: 80 | tmp = results 81 | all_outputs = results[1] 82 | results = results[0] 83 | if all([res == True for res in results]): 84 | if return_output: 85 | return True, results, all_outputs 86 | return True, results 87 | else: 88 | if return_output: 89 | return False, results, all_outputs 90 | return False, results 91 | 92 | def estimate_pass_at_k( 93 | num_samples: Union[int, List[int], np.ndarray], 94 | num_correct: Union[List[int], np.ndarray], 95 | k: int, 96 | ) -> np.ndarray: 97 | """ 98 | Estimates pass@k of each problem and returns them in an array. 99 | Taken from https://github.com/openai/human-eval/blob/master/human_eval/evaluation.py#L13. 100 | """ 101 | def estimator(n: int, c: int, k: int) -> float: 102 | """ 103 | Calculates 1 - comb(n - c, k) / comb(n, k). 
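104 |         For example, with n = 10 samples of which c = 3 are correct,
105 |         pass@1 = 1 - C(7, 1) / C(10, 1) = 0.3, i.e. the empirical success rate c / n.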
104 | """ 105 | if n - c < k: 106 | return 1.0 107 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 108 | 109 | if isinstance(num_samples, int): 110 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 111 | else: 112 | assert len(num_samples) == len(num_correct) 113 | num_samples_it = iter(num_samples) 114 | 115 | return np.array( 116 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 117 | ) 118 | 119 | def compute_pass_at_ks(results, ks): 120 | output = { 121 | k: estimate_pass_at_k( 122 | [len(x) for x in results], 123 | [sum([i == True for i in x]) for x in results], 124 | k, 125 | ).mean() 126 | for k in ks 127 | } 128 | return output -------------------------------------------------------------------------------- /apps/icl.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | num_icl_shot=2 4 | code_type = $2 5 | 6 | if [ $1 == deepseek ]; then 7 | model=deepseek-ai/deepseek-coder-6.7b-base 8 | else 9 | model=meta-llama/CodeLlama-7b-hf 10 | fi 11 | 12 | task0() { 13 | local seed=$1 14 | llama_size=7 15 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 16 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 17 | --seed $seed --model ${llama_model} \ 18 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 19 | --num_gen 10 --code_type mc \ 20 | --temperature ${temperature} --max_new_token 1024 \ 21 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 22 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 23 | task1_completed $seed 24 | } 25 | 26 | task1() { 27 | local seed=$1 28 | llama_size=7 29 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 30 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 31 | --seed $seed --model ${llama_model} \ 32 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 33 | --num_gen 10 --code_type mc \ 34 | --temperature ${temperature} --max_new_token 1024 \ 35 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 36 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 37 | task1_completed $seed 38 | } 39 | task2() { 40 | local seed=$1 41 | deepseek_size=6.7 42 | deepseek_model=deepseek-ai/deepseek-coder-${deepseek_size}b-base 43 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 44 | --seed $seed --model ${deepseek_model} \ 45 | --num_gpu ${num_gpu} --dtype bfloat16 --num_icl_shot ${num_icl_shot} \ 46 | --num_gen 10 --code_type mc \ 47 | --temperature ${temperature} --max_new_token 1024 \ 48 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 49 | > log/deepseek_${deepseek_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 50 | task2_completed $seed 51 | } 52 | task3() { 53 | local seed=$1 54 | llama_size=7 55 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 56 | CUDA_VISIBLE_DEVICES=2,3 python -u icl.py \ 57 | --seed $seed --model ${llama_model} \ 58 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 59 | --num_gen 10 --code_type sc \ 60 | --temperature ${temperature} --max_new_token 1024 \ 61 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 62 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_sc_${seed}.log 2>&1 63 | task3_completed $seed 64 | } 65 | 66 | task4() { 67 | local seed=$1 68 | deepseek_size=6.7 69 | deepseek_model=deepseek-ai/deepseek-coder-${deepseek_size}b-base 70 | CUDA_VISIBLE_DEVICES=3 python -u icl.py \ 71 | --seed $seed 
--model ${deepseek_model} \ 72 | --num_gpu ${num_gpu} --dtype bfloat16 --num_icl_shot ${num_icl_shot} \ 73 | --num_gen 10 --code_type sc \ 74 | --temperature ${temperature} --max_new_token 1024 \ 75 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 76 | > log/deepseek_${deepseek_size}b_${num_icl_shot}shot_${temperature}temp_sc_${seed}.log 2>&1 77 | task4_completed $seed 78 | } 79 | task1_completed() { 80 | local seed=$1 81 | # Start task1 for the next seed 82 | next_seed=$(next_seed $seed) 83 | if [ -n "$next_seed" ]; then 84 | task1 $next_seed & 85 | fi 86 | } 87 | 88 | task2_completed() { 89 | local seed=$1 90 | # Start task2 for the next seed 91 | next_seed=$(next_seed $seed) 92 | if [ -n "$next_seed" ]; then 93 | task2 $next_seed & 94 | fi 95 | } 96 | 97 | task3_completed() { 98 | local seed=$1 99 | # Start task1 for the next seed 100 | next_seed=$(next_seed $seed) 101 | if [ -n "$next_seed" ]; then 102 | task3 $next_seed & 103 | fi 104 | } 105 | 106 | task4_completed() { 107 | local seed=$1 108 | # Start task2 for the next seed 109 | next_seed=$(next_seed $seed) 110 | if [ -n "$next_seed" ]; then 111 | task4 $next_seed & 112 | fi 113 | } 114 | 115 | next_seed() { 116 | local seed=$1 117 | case $seed in 118 | 27) echo 42 ;; 119 | 42) echo 101 ;; 120 | 101) echo "" ;; 121 | 169) echo "" ;; 122 | esac 123 | } 124 | 125 | temperature=0.1 126 | num_gpu=2 127 | swap_space=$((64/num_gpu)) 128 | num_icl_shot=2 129 | 130 | 131 | # Start the first tasks 132 | task0 134 & 133 | # task1 27 & 134 | # task3 27 & 135 | # task2 27 & 136 | # task4 27 & 137 | 138 | # Wait for all background jobs to finish 139 | wait -------------------------------------------------------------------------------- /codecontests/calculate_corr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate Correlation (execute after evaluation)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# CL 7b, DS 7b modularity\n", 17 | "\n", 18 | "import os\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "\n", 21 | "from utils.utils_evaluate import compute_pass_at_ks, verify_code_official\n", 22 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_modularity_score\n", 23 | "\n", 24 | "from scipy import stats\n", 25 | "\n", 26 | "\n", 27 | "models = ['meta-llama-CodeLlama-7b-hf', 'deepseek-ai-deepseek-coder-6.7b-base']\n", 28 | "code_element = ['modularity']\n", 29 | "\n", 30 | "num_code = 100\n", 31 | "num_gen = 10\n", 32 | "k = 1\n", 33 | "base_directory = os.getcwd()\n", 34 | "for model in models:\n", 35 | " codes = []\n", 36 | " for element in code_element:\n", 37 | " num_point = 0\n", 38 | " performances = []\n", 39 | " element_values = []\n", 40 | " for code_idx in range(num_code):\n", 41 | " file_name = f'{model}_1shot_10gen_0.1temp_{element}_{code_idx}code_icl_result.jsonl'\n", 42 | " if not os.path.exists(os.path.join(base_directory, \"result\", \"corr_exp_evaluation_result\", file_name)):\n", 43 | " continue\n", 44 | " \n", 45 | " num_point += 1 # number of points in the correlation plot (=number of evaluation result)\n", 46 | " results = read_jsonl_to_dict(os.path.join(base_directory, \"result\", \"corr_exp_evaluation_result\", file_name))\n", 47 | " passed_results = []\n", 48 | " for result in results:\n", 49 | " assert len(result['passed']) == num_gen\n", 50 | " 
passed_results.append(result['passed'])\n", 51 | "\n", 52 | " # code\n", 53 | " codes.append(results[0]['demonstration']['code'][0])\n", 54 | " \n", 55 | " # pass@k\n", 56 | " performances.append(compute_pass_at_ks(passed_results, [k])[k])\n", 57 | " \n", 58 | " # style or modularity\n", 59 | " if element == 'style':\n", 60 | " element_values.append(results[0]['demonstration']['score_style'][0]['score_pep8'])\n", 61 | " elif element == 'modularity':\n", 62 | " element_values.append(results[0]['demonstration']['score_modularity'][0])\n", 63 | " \n", 64 | " # re calculate modularity\n", 65 | " # element_values = []\n", 66 | " # for code in codes:\n", 67 | " # element_values.append(get_code_modularity_score(code))\n", 68 | "\n", 69 | " # plt.scatter(element_values, [0.5] * len(element_values), color='blue', label='only mod')\n", 70 | "\n", 71 | " # calculate correlation\n", 72 | " pearsonr_stat = stats.pearsonr(element_values, performances)\n", 73 | " pearsonr, pearsonr_p = pearsonr_stat.correlation, pearsonr_stat.pvalue\n", 74 | " spearmanr_stat = stats.spearmanr(element_values, performances)\n", 75 | " spearmanr, spearmanr_p = spearmanr_stat.correlation, spearmanr_stat.pvalue\n", 76 | " \n", 77 | " performances = [performance * 100 for performance in performances] # for better visualization\n", 78 | " plt.scatter(element_values, performances, color='red', label='Sampled Data')\n", 79 | " plt.xlabel(element)\n", 80 | " plt.ylabel('pass@k')\n", 81 | " plt.legend()\n", 82 | " plt.show()\n", 83 | " \n", 84 | " print(f'model: {model}')\n", 85 | " print(f'pearsonr: {round(pearsonr, 2)}, pearsonr_p: {round(pearsonr_p, 2)}')\n", 86 | " print(f'spearmanr: {round(spearmanr, 2)}, spearmanr_p: {round(spearmanr_p, 2)}')\n", 87 | " print(f'num data: {num_point}')\n" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "mcg", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.9.19" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /codecontests/evaluate_ft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | 
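        # temperature is passed through to vLLM sampling ("0 means greedy");
        # the repo's scripts use 0.1 for pass@1 (num_gen=10) and 0.6 for the
        # larger num_gen=50 runs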
default=0,
42 |         required=True,
43 |         help="0 means greedy decoding for vllm",
44 |     )
45 |     parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k")
46 |     parser.add_argument('--degree', type=str, required=False, default='low')
47 |     parser.add_argument('--chkpt', type=str, required=False, default='0')  # checkpoint tag, e.g. '_final'
48 | 
49 |     args = parser.parse_args()
50 | 
51 |     base_directory = os.path.dirname(__file__)
52 |     test_dataset = load_dataset(
53 |         "deepmind/code_contests", split="test",
54 |     )
55 | 
56 |     if "CodeLlama" in args.model:
57 |         result_file = f"CodeLlama_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
58 |     elif "deepseek" in args.model:
59 |         result_file = f"DeepSeek_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
60 | 
61 |     # result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
62 | 
63 |     if not os.path.exists(os.path.join(base_directory, "result/ft", result_file)):
64 |         print('result file does not exist')
65 |         return
66 | 
67 |     if os.path.exists(os.path.join(base_directory, "result/ft/result", result_file)):
68 |         print('result file already exists')
69 |         return
70 | 
71 |     result_data = read_jsonl_to_dict(os.path.join(base_directory, "result/ft", result_file))
72 |     print('result file path:')
73 |     print(os.path.join(base_directory, "result/ft", result_file))
74 | 
75 |     start = time.time()
76 |     passed_results = []
77 |     for i, data in enumerate(tqdm(result_data)):
78 |         # make test cases for each problem
79 |         tests = {"inputs": [], "outputs": []}
80 |         tests["inputs"].extend(data["public_tests"]["input"])
81 |         tests["inputs"].extend(data["private_tests"]["input"])
82 |         tests["outputs"].extend(data["public_tests"]["output"])
83 |         tests["outputs"].extend(data["private_tests"]["output"])
84 |         assert len(tests["inputs"]) == len(tests["outputs"])
85 | 
86 |         time_limit = test_dataset[i]["time_limit"]["seconds"]
87 |         passed = []
88 |         for code in data["extracted_solutions"]:
89 |             # run each candidate in its own process so the problem's time limit can be enforced
90 |             manager = multiprocessing.Manager()
91 |             manager_list = manager.list()
92 |             p = multiprocessing.Process(
93 |                 target=_temp_run, args=(code, tests, manager_list)
94 |             )
95 |             p.start()
96 |             p.join(timeout=time_limit + 1)
97 | 
98 |             if p.is_alive():
99 |                 p.kill()
100 |             if not manager_list:  # child crashed or timed out before reporting
101 |                 passed.append(0)
102 |             else:
103 |                 passed.append(1 if manager_list[0] == True else 0)
104 | 
105 |         result_data[i]["passed"] = passed  # new data
106 |         passed_results.append(passed)
107 | 
108 |     print(f"time: {time.time() - start:.2f}s")
109 |     ks = [args.k]
110 |     performance = compute_pass_at_ks(passed_results, ks)
111 |     print(f"pass@{ks[0]}: {performance}")
112 |     # add pass information to result_data and save
113 |     write_dict_to_jsonl(result_data, os.path.join(base_directory, "result/ft/result", result_file))
114 |     print(f'{result_file} saved.')
115 | 
116 |     print('program ends.')
117 | 
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 
--------------------------------------------------------------------------------
/codecontests/evaluate_corr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import multiprocessing
4 | import time
5 | 
6 | from tqdm import tqdm
7 | 
8 | from 
utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument( 27 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 28 | ) 29 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 30 | parser.add_argument( 31 | "--num_gen", 32 | type=int, 33 | required=True, 34 | default=1, 35 | help="number of solutions generated per problem", 36 | ) 37 | parser.add_argument( 38 | "--temperature", 39 | type=float, 40 | default=0, 41 | required=True, 42 | help="0 means greedy decoding for vllm", 43 | ) 44 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 45 | parser.add_argument( 46 | "--metric", 47 | type=str, 48 | required=True, 49 | default='style', 50 | help="code metric (e.g., style or modularity)", 51 | ) 52 | args = parser.parse_args() 53 | 54 | base_directory = os.path.dirname(__file__) 55 | test_dataset = load_dataset( 56 | "deepmind/code_contests", split="test", cache_dir="/data/huggingface/datasets" 57 | ) 58 | 59 | performances = [] 60 | metrics = [] 61 | 62 | for code_idx in tqdm(range(100)): 63 | result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.metric}_{code_idx}code_icl_result.jsonl" 64 | if not os.path.exists(os.path.join(base_directory, "result", result_file)): 65 | continue 66 | 67 | if os.path.exists(os.path.join(base_directory, "result", "corr_exp_evaluation_result", result_file)): 68 | continue 69 | 70 | result_data = read_jsonl_to_dict(os.path.join(base_directory, "result", result_file)) 71 | assert len(result_data) == 165 72 | 73 | start = time.time() 74 | passed_results = [] 75 | for i, data in enumerate(result_data): 76 | # make test cases for each problem 77 | tests = {"inputs": [], "outputs": []} 78 | tests["inputs"].extend(data["public_tests"]["input"]) 79 | tests["inputs"].extend(data["private_tests"]["input"]) 80 | tests["outputs"].extend(data["public_tests"]["output"]) 81 | tests["outputs"].extend(data["private_tests"]["output"]) 82 | assert len(tests["inputs"]) == len(tests["outputs"]) 83 | 84 | time_limit = test_dataset[i]["time_limit"]["seconds"] 85 | passed = [] 86 | for code in data["extracted_solutions"]: 87 | manager = multiprocessing.Manager() 88 | manager_list = manager.list() 89 | p = multiprocessing.Process( 90 | target=_temp_run, args=(code, tests, manager_list) 91 | ) 92 | p.start() 93 | p.join(timeout=time_limit + 1) 94 | 95 | if p.is_alive(): 96 | p.kill() 97 | if not manager_list: 98 | passed.append(0) 99 | else: 100 | if manager_list[0] == True: 101 | passed.append(1) 102 | else: 103 | passed.append(0) 104 | 105 | result_data[i]["passed"] = passed # new data 106 | passed_results.append(passed) 107 | 108 | print(f"time: {time.time() - start:.2f}s") 109 | ks = [args.k] 110 | performance = compute_pass_at_ks(passed_results, ks) 111 | print(f"pass@{ks[0]}: {performance}") 112 | # statistics for one dot in the correlation figure 113 | performances.append(performance) 114 | metrics.append(result_data[0]['demonstration']) 115 | # add pass information to result_data and save 116 | 
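        # result_data now carries a 0/1 `passed` list (length num_gen) per
        # problem; saving it lets calculate_corr.ipynb recompute pass@k and
        # the metric correlations later without re-running the solutions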
write_dict_to_jsonl(result_data, os.path.join(base_directory, "result", "corr_exp_evaluation_result", result_file)) 117 | print(f'{result_file} saved.') 118 | 119 | print('program ends.') 120 | 121 | # # compute correlation 122 | # pass_at_k = [e[args.k] for e in performances] 123 | 124 | # if args.metric == 'style': 125 | # style = [m['score_style'][0]['score_pep8'] for m in metrics] 126 | # print(stats.pearsonr(style, pass_at_k)) 127 | # print(stats.spearmanr(style, pass_at_k)) 128 | # elif args.metric == 'modularity': 129 | # modularity = [m['score_modularity'][0] for m in metrics] 130 | # print(stats.pearsonr(modularity, pass_at_k)) 131 | # print(stats.spearmanr(modularity, pass_at_k)) 132 | 133 | 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /codecontests/construct_mc_sc_divided_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# divide existing codes into monolithic and modular codes by certain criteria\n", 10 | "# (ex, average cc and number of modules used)\n", 11 | "def divide_into_monolithic_and_modular_codes(dataset, cc_limit=10, min_num_module=3):\n", 12 | " from utils.utils import count_module_written\n", 13 | " \n", 14 | " \n", 15 | " new_dataset = []\n", 16 | "\n", 17 | " for data in dataset:\n", 18 | " # save basic information\n", 19 | " new_data = {}\n", 20 | " new_data['problem_name'] = data['name']\n", 21 | " new_data['problem_description'] = data['description']\n", 22 | " new_data['public_tests'] = data['public_tests']\n", 23 | " new_data['private_tests'] = data['private_tests']\n", 24 | "\n", 25 | " passed = data['solutions']['passed']\n", 26 | " cc = data['solutions']['cc']\n", 27 | " solution = data['solutions']['solution']\n", 28 | " module_list = data['solutions']['modules']\n", 29 | "\n", 30 | " assert(len(passed) == len(cc) == len(solution) == len(module_list))\n", 31 | "\n", 32 | " # 1. get monolithic code\n", 33 | " monolithic_code_index = []\n", 34 | " for i, modules in enumerate(module_list):\n", 35 | " # filter solution that does not pass the test case\n", 36 | " if not passed[i]:\n", 37 | " continue\n", 38 | " \n", 39 | " if len(modules) == 0 and cc[i] >= cc_limit:\n", 40 | " monolithic_code_index.append(i)\n", 41 | "\n", 42 | " # no monolithic code candidate exists\n", 43 | " # if len(monolithic_code_index) == 0:\n", 44 | " # continue\n", 45 | "\n", 46 | " tmp = {}\n", 47 | " tmp['monolithic_code'] = [solution[i] for i in monolithic_code_index]\n", 48 | " tmp['monolithic_code_cc'] = [cc[i] for i in monolithic_code_index]\n", 49 | " new_data['monolithic_codes'] = tmp\n", 50 | " \n", 51 | " # 2. 
get modular code\n", 52 | " modular_code_index = []\n", 53 | " for i, (code, modules) in enumerate(zip(solution, module_list)):\n", 54 | " # filter solution that does not pass the test case\n", 55 | " if not passed[i]:\n", 56 | " continue\n", 57 | " \n", 58 | " if len(modules) < min_num_module: continue # at least three modules in the code\n", 59 | " module_use_count = [count_module_written(code, module) for module in modules]\n", 60 | " if all(count >= 2 for count in module_use_count): # all modules must be used\n", 61 | " if cc[i] < cc_limit: # and cc of code must be under 10\n", 62 | " modular_code_index.append(i)\n", 63 | " \n", 64 | " # no modular code candidate exists\n", 65 | " # if len(modular_code_index) == 0:\n", 66 | " # continue\n", 67 | " \n", 68 | " tmp = {}\n", 69 | " tmp['modular_code'] = [solution[i] for i in modular_code_index]\n", 70 | " tmp['modular_code_cc'] = [cc[i] for i in modular_code_index]\n", 71 | " new_data['modular_codes'] = tmp\n", 72 | "\n", 73 | " new_dataset.append(new_data)\n", 74 | " \n", 75 | " \n", 76 | " # 3. remove question without pair data is collected\n", 77 | " remove_index = []\n", 78 | " for i, data in enumerate(new_dataset):\n", 79 | " # at least one monolithic code must exist per problem\n", 80 | " # it is okay to have no modular code\n", 81 | " if len(data['monolithic_codes']['monolithic_code']) == 0:\n", 82 | " remove_index.append(i)\n", 83 | " new_dataset = [new_dataset[i] for i in range(len(new_dataset)) if i not in remove_index]\n", 84 | "\n", 85 | " \n", 86 | " return new_dataset" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### load my codecontests dataset and extract problems with both sc and mc codes" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 1, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "/home/kdy20401/anaconda3/envs/code/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 106 | " from .autonotebook import tqdm as notebook_tqdm\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl\n", 112 | "import os\n", 113 | "\n", 114 | "train_dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 115 | "\n", 116 | "_train_dataset = divide_into_monolithic_and_modular_codes(train_dataset)\n", 117 | "\n", 118 | "write_dict_to_jsonl(_train_dataset, os.path.join(os.getcwd(), 'data', 'my_code_contests_divided_train.jsonl'))\n" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "code", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.9.18" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /codecontests/icl_ft.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # num_gpu=4 4 | # dtype=float16 5 | # num_icl_shot=2 6 | # num_gen=50 7 | # temperature=0.6 8 | # swap_space=8 9 | # code_type=modular 10 | 11 | # for size in 34; do 12 | # for seed in 27 42 101 134 169; do 13 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 14 | # --seed ${seed} \ 15 | # --model meta-llama/CodeLlama-${size}b-hf \ 16 | # --num_gpu ${num_gpu} \ 17 | # --dtype ${dtype} \ 18 | # --num_icl_shot ${num_icl_shot} \ 19 | # --num_gen ${num_gen} \ 20 | # --temperature ${temperature} \ 21 | # --max_new_token 1024 \ 22 | # --top_p 0.95 \ 23 | # --swap_space ${swap_space} \ 24 | # --code_type ${code_type} \ 25 | # > log/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 26 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 27 | # done 28 | # done 29 | 30 | # # CL 7b, pass@1(n=10) 31 | # num_gpu=4 32 | # dtype=float16 33 | # num_icl_shot=2 34 | # num_gen=10 35 | # temperature=0.1 36 | # swap_space=8 37 | # code_type=monolithic 38 | 39 | # for size in 34; do 40 | # for seed in 27 42 101 134 169; do 41 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 42 | # --seed ${seed} \ 43 | # --model meta-llama/CodeLlama-${size}b-hf \ 44 | # --num_gpu ${num_gpu} \ 45 | # --dtype ${dtype} \ 46 | # --num_icl_shot ${num_icl_shot} \ 47 | # --num_gen ${num_gen} \ 48 | # --temperature ${temperature} \ 49 | # --max_new_token 1024 \ 50 | # --top_p 0.95 \ 51 | # --swap_space ${swap_space} \ 52 | # --code_type ${code_type} \ 53 | # > log/inference/2shot_mc/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 54 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 55 | # done 56 | # done 57 | 58 | 59 | # # DS 60 | # num_gpu=4 61 | # dtype=bfloat16 62 | # num_icl_shot=2 63 | # num_gen=50 64 | # temperature=0.1 65 | # swap_space=8 66 | # code_type=modular 67 | 68 | # for size in 33; do 69 | # for seed in 27 42 101 134 169; do 70 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 71 | # --seed ${seed} \ 72 | # --model deepseek-ai/deepseek-coder-${size}b-base \ 73 | # --num_gpu ${num_gpu} \ 74 | # --dtype ${dtype} \ 75 | # --num_icl_shot ${num_icl_shot} \ 
76 | # --num_gen ${num_gen} \ 77 | # --temperature ${temperature} \ 78 | # --max_new_token 1024 \ 79 | # --top_p 0.95 \ 80 | # --swap_space ${swap_space} \ 81 | # --code_type ${code_type} \ 82 | # > log/inference/2shot_mc/ds${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 83 | # echo ds${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 84 | # done 85 | # done 86 | 87 | 88 | # num_gpu=4 89 | # dtype=float16 90 | # num_icl_shot=2 91 | # num_gen=50 92 | # temperature=0.6 93 | # swap_space=8 94 | # code_type=modular 95 | 96 | # for size in 34; do 97 | # for seed in 27 42 101 134 169; do 98 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 99 | # --seed ${seed} \ 100 | # --model meta-llama/CodeLlama-${size}b-hf \ 101 | # --num_gpu ${num_gpu} \ 102 | # --dtype ${dtype} \ 103 | # --num_icl_shot ${num_icl_shot} \ 104 | # --num_gen ${num_gen} \ 105 | # --temperature ${temperature} \ 106 | # --max_new_token 1024 \ 107 | # --top_p 0.95 \ 108 | # --swap_space ${swap_space} \ 109 | # --code_type ${code_type} \ 110 | # > log/inference/2shot_mc/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 111 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 112 | # done 113 | # done 114 | 115 | # # DS 116 | # num_gpu=4 117 | # dtype=bfloat16 118 | # num_icl_shot=2 119 | # num_gen=50 120 | # temperature=0.6 121 | # swap_space=8 122 | # code_type=modular 123 | 124 | # for size in 33; do 125 | # for seed in 27 42 101 134 169; do 126 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 127 | # --seed ${seed} \ 128 | # --model deepseek-ai/deepseek-coder-${size}b-base \ 129 | # --num_gpu ${num_gpu} \ 130 | # --dtype ${dtype} \ 131 | # --num_icl_shot ${num_icl_shot} \ 132 | # --num_gen ${num_gen} \ 133 | # --temperature ${temperature} \ 134 | # --max_new_token 1024 \ 135 | # --top_p 0.95 \ 136 | # --swap_space ${swap_space} \ 137 | # --code_type ${code_type} \ 138 | # > log/inference/2shot_mc/ds${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 139 | # echo ds${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 140 | # done 141 | # done 142 | 143 | 144 | # inference from ft checkpoint, pass@1(n=10) 145 | seed=27 146 | model=deepseek 147 | num_gpu=2 148 | dtype=float16 149 | num_icl_shot=0 150 | temperature=0.6 151 | code_type=monolithic 152 | swap_space=16 153 | chkpt=_final 154 | num_gen=50 # 155 | debug_mode=0 # 156 | 157 | # for low and high model simultaneously 158 | degree=low 159 | CUDA_VISIBLE_DEVICES=0,1 nohup python icl_ft.py \ 160 | --seed ${seed} \ 161 | --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/${model}/${degree}/ \ 162 | --num_gpu ${num_gpu} \ 163 | --dtype ${dtype} \ 164 | --num_icl_shot ${num_icl_shot} \ 165 | --num_gen ${num_gen} \ 166 | --temperature ${temperature} \ 167 | --max_new_token 1024 \ 168 | --top_p 0.95 \ 169 | --swap_space ${swap_space} \ 170 | --code_type ${code_type} \ 171 | --degree ${degree} \ 172 | --debug_mode ${debug_mode} \ 173 | --chkpt ${chkpt} \ 174 | > log/inference/tmp/${model}_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 175 | 176 | degree=high 177 | CUDA_VISIBLE_DEVICES=2,3 nohup python icl_ft.py \ 178 | --seed ${seed} \ 179 | --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/${model}/${degree}/ \ 180 | --num_gpu ${num_gpu} \ 181 | --dtype ${dtype} \ 182 | --num_icl_shot ${num_icl_shot} \ 183 | --num_gen ${num_gen} \ 
184 | --temperature ${temperature} \ 185 | --max_new_token 1024 \ 186 | --top_p 0.95 \ 187 | --swap_space ${swap_space} \ 188 | --code_type ${code_type} \ 189 | --degree ${degree} \ 190 | --debug_mode ${debug_mode} \ 191 | --chkpt ${chkpt} \ 192 | > log/inference/tmp/${model}_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 193 | wait && 194 | echo done! -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_169seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["A prime number is a number which has exactly two distinct divisors: one and itself. For example, numbers 2, 7, 3 are prime, and 1, 6, 4 are not.\n\nThe next prime number after x is the smallest prime number greater than x. For example, the next prime number after 2 is 3, and the next prime number after 3 is 5. Note that there is exactly one next prime number after each number. So 5 is not the next prime number for 2.\n\nOne cold April morning Panoramix predicted that soon Kakofonix will break free from his straitjacket, and this will be a black day for the residents of the Gallic countryside.\n\nPanoramix's prophecy tells that if some day Asterix and Obelix beat exactly x Roman soldiers, where x is a prime number, and next day they beat exactly y Roman soldiers, where y is the next prime number after x, then it's time to wait for Armageddon, for nothing can shut Kakofonix up while he sings his infernal song.\n\nYesterday the Gauls beat n Roman soldiers and it turned out that the number n was prime! Today their victims were a troop of m Romans (m > n). Determine whether the Gauls should wait for the black day after today's victory of Asterix and Obelix?\n\nInput\n\nThe first and only input line contains two positive integers \u2014 n and m (2 \u2264 n < m \u2264 50). It is guaranteed that n is prime.\n\nPretests contain all the cases with restrictions 2 \u2264 n < m \u2264 4.\n\nOutput\n\nPrint YES, if m is the next prime number after n, or NO otherwise.\n\nExamples\n\nInput\n\n3 5\n\n\nOutput\n\nYES\n\nInput\n\n7 11\n\n\nOutput\n\nYES\n\nInput\n\n7 9\n\n\nOutput\n\nNO", "A bracket sequence is a string that is one of the following:\n\n1. An empty string;\n2. The concatenation of `(`, A, and `)` in this order, for some bracket sequence A ;\n3. The concatenation of A and B in this order, for some non-empty bracket sequences A and B /\n\n\n\nGiven are N strings S_i. 
Can a bracket sequence be formed by concatenating all the N strings in some order?\n\nConstraints\n\n* 1 \\leq N \\leq 10^6\n* The total length of the strings S_i is at most 10^6.\n* S_i is a non-empty string consisting of `(` and `)`.\n\nInput\n\nInput is given from Standard Input in the following format:\n\n\nN\nS_1\n:\nS_N\n\n\nOutput\n\nIf a bracket sequence can be formed by concatenating all the N strings in some order, print `Yes`; otherwise, print `No`.\n\nExamples\n\nInput\n\n2\n)\n(()\n\n\nOutput\n\nYes\n\n\nInput\n\n2\n)(\n()\n\n\nOutput\n\nNo\n\n\nInput\n\n4\n((()))\n((((((\n))))))\n()()()\n\n\nOutput\n\nYes\n\n\nInput\n\n3\n(((\n)\n)\n\n\nOutput\n\nNo"], "public_tests": [{"input": ["7 9\n", "3 5\n", "7 11\n"], "output": ["NO\n", "YES\n", "YES\n"]}, {"input": ["3\n(((\n)\n)", "2\n)\n(()", "4\n((()))\n((((((\n))))))\n()()()", "2\n)(\n()"], "output": ["No", "Yes", "Yes", "No"]}], "private_tests": [{"input": ["2 6\n", "31 33\n", "2 11\n", "41 49\n", "13 17\n", "23 29\n", "7 8\n", "5 13\n", "47 50\n", "43 47\n", "17 19\n", "5 9\n", "2 50\n", "2 3\n", "3 7\n", "13 20\n", "11 13\n", "19 23\n", "5 11\n", "3 9\n", "5 6\n", "23 25\n", "43 49\n", "5 7\n", "3 4\n", "7 13\n", "3 6\n", "37 41\n", "13 15\n", "2 7\n", "5 15\n", "47 48\n", "2 5\n", "31 37\n", "29 31\n", "19 21\n", "2 4\n", "41 43\n", "47 49\n"], "output": ["NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "YES\n", "NO\n"]}, {"input": [], "output": []}], "transformed_sc": ["n, m = map(int, input().split())\nfound = False\nnum = n + 1\nwhile True:\n if num < 2:\n prime = False\n else:\n prime = True\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n prime = False\n break\n if prime:\n if num == m:\n print(\"YES\")\n found = True\n break\n num += 1\nif not found:\n print(\"NO\")", "import sys\n\n\nn = int(input())\ns = [list(input()) for _ in range(n)]\n\nct1_total = 0\nct2_total = 0\nL = []\n\nfor i in range(n):\n ct1 = 0\n ct2 = 0\n\n for char in s[i]:\n if char == '(':\n ct1 += 1\n else:\n ct2 += 1\n \n ct1_total += ct1\n ct2_total += ct2\n \n ct1 = 0\n ct2 = 0\n l = [0]\n\n for char in s[i]:\n if char == '(':\n ct1 += 1\n l.append(ct1)\n else:\n ct2 += 1\n l.append(-ct2)\n \n L.append(l)\n\nif ct1_total != ct2_total:\n result = 'No'\n print(result)\n sys.exit()\n\nL1 = []\nL2 = []\n\nfor l in L:\n if l[-1] >= 0:\n L1.append((min(l), l[-1]))\n else:\n L2.append((min(l) - l[-1], -l[-1]))\n\nL1.sort()\nL1.reverse()\nct4 = 0\n\nresult1 = ''\nfor i in range(len(L1)):\n if ct4 + L1[i][0] < 0:\n result1 = 'No'\n ct4 += L1[i][1]\n\nif result1 == '':\n result1 = 'Yes'\n\nL2.sort()\nL2.reverse()\nct4 = 0\n\nresult2 = ''\nfor i in range(len(L2)):\n if ct4 + L2[i][0] < 0:\n result2 = 'No'\n ct4 += L2[i][1]\n\nif result2 == '':\n result2 = 'Yes'\n\nif result1 == 'Yes' and result2 == 'Yes':\n result = 'Yes'\nelse:\n result = 'No'\n\nprint(result)"], "sc": ["n,m=map(int,input().split())\ni=n+1\nfor i in range(i,m+1):\n t=0\n for j in range(2,i):\n if(i%j==0):\n t=1\n break\n\n if((i==m and t==1 )or t==0 and i!=m):\n print(\"NO\")\n break\n elif(i==m and t==0):\n print(\"YES\")\n break\n else:\n continue\n \n \n \n", "import sys\nn=int(input())\ns=[list(input()) for i in range(n)]\nL1=[]\nL2=[]\nct1=0;ct2=0\nfor i in range(n):\n ct3=0\n l=[0]\n for j in range(len(s[i])):\n if 
s[i][j]=='(':\n ct1+=1\n ct3+=1\n l.append(ct3)\n else:\n ct2+=1\n ct3-=1\n l.append(ct3)\n if l[-1]>=0:\n L1.append((min(l),l[-1]))\n else:\n L2.append((min(l)-l[-1],-l[-1]))\nif ct1!=ct2:\n print('No')\n sys.exit()\n\nL1.sort()\nL1.reverse()\nct4=0\nfor i in range(len(L1)):\n if ct4+L1[i][0]<0:\n print('No')\n sys.exit()\n ct4+=L1[i][1]\n\nL2.sort()\nL2.reverse()\nct5=0\nfor i in range(len(L2)):\n if ct5+L2[i][0]<0:\n print('No')\n sys.exit()\n ct5+=L2[i][1]\n\nprint('Yes')"], "sc_cc": [10.0, 11.0], "transformed_mc": ["\ndef is_prime(num):\n if num < 2:\n return False\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n return False\n return True\n\ndef find_next_prime(n):\n i = n + 1\n while True:\n if is_prime(i):\n return i\n i += 1\n\ndef check_for_black_day(n, m):\n next_prime = find_next_prime(n)\n if next_prime == m:\n return \"YES\"\n else:\n return \"NO\"\n\ndef main():\n n, m = map(int, input().split())\n result = check_for_black_day(n, m)\n print(result)\n\nif __name__ == '__main__':\n main()\n", "import sys\n\ndef read_input():\n n = int(input())\n s = [list(input()) for _ in range(n)]\n return n, s\n\ndef count_brackets(s):\n ct1 = 0\n ct2 = 0\n\n for char in s:\n if char == '(':\n ct1 += 1\n else:\n ct2 += 1\n\n return ct1, ct2\n\ndef process_string(s):\n ct1 = 0\n ct2 = 0\n l = [0]\n\n for char in s:\n if char == '(':\n ct1 += 1\n l.append(ct1)\n else:\n ct2 += 1\n l.append(-ct2)\n\n return l\n\ndef check_valid_sequences(L):\n L1 = []\n L2 = []\n\n for l in L:\n if l[-1] >= 0:\n L1.append((min(l), l[-1]))\n else:\n L2.append((min(l) - l[-1], -l[-1]))\n\n return L1, L2\n\ndef check_sequence_order(L, ct):\n L.sort()\n L.reverse()\n ct4 = 0\n\n for i in range(len(L)):\n if ct4 + L[i][0] < 0:\n return 'No'\n ct4 += L[i][1]\n\n return 'Yes'\n\ndef check_bracket_sequence(n, s):\n ct1_total = 0\n ct2_total = 0\n L = []\n\n for i in range(n):\n ct1, ct2 = count_brackets(s[i])\n ct1_total += ct1\n ct2_total += ct2\n l = process_string(s[i])\n L.append(l)\n\n if ct1_total != ct2_total:\n return 'No'\n\n L1, L2 = check_valid_sequences(L)\n\n result1 = check_sequence_order(L1, ct1_total)\n result2 = check_sequence_order(L2, ct2_total)\n\n if result1 == 'Yes' and result2 == 'Yes':\n return 'Yes'\n else:\n return 'No'\n\ndef main():\n n, s = read_input()\n result = check_bracket_sequence(n, s)\n print(result)\n\nif __name__ == '__main__':\n main()"]} 2 | -------------------------------------------------------------------------------- /codecontests/construct_demonstration_for_correlation_experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate code properties among 10% of original data and save" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import random\n", 18 | "from datasets import Dataset\n", 19 | "from utils.utils import get_code_style_score, get_code_modularity_score, read_jsonl_to_dict, write_dict_to_jsonl\n", 20 | "\n", 21 | "\n", 22 | "def compute_code_score(example):\n", 23 | " code = example['code']\n", 24 | " try:\n", 25 | " score_style = get_code_style_score(code)\n", 26 | " score_modularity = get_code_modularity_score(code)\n", 27 | " except Exception:\n", 28 | " score_style = {\n", 29 | " 'score_var': -1.0,\n", 30 | " 'score_pep8': -1.0,\n", 31 | " 'score_style': -1.0,\n", 32 | " }\n", 33 | " 
score_modularity = -1.0\n", 34 | "\n", 35 | " example['score_style'] = score_style\n", 36 | " example['score_modularity'] = score_modularity\n", 37 | " return example\n", 38 | "\n", 39 | "\n", 40 | "def check_code_score(example):\n", 41 | " return example['score_style']['score_var'] >= 0 and example['score_style']['score_pep8'] >= 0 and example['score_modularity'] >= 0\n", 42 | "\n", 43 | "\n", 44 | "dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 45 | "demonstration = []\n", 46 | "\n", 47 | "# aggregate demonstration code\n", 48 | "# keys for dataset: dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])\n", 49 | "# keys for solutions: dict_keys(['cc', 'modules', 'passed', 'solution'])\n", 50 | "for data in dataset:\n", 51 | " for i in range(len(data['solutions']['solution'])):\n", 52 | " if data['solutions']['passed'][i]:\n", 53 | " demonstration.append(\n", 54 | " {\n", 55 | " 'description': data['description'],\n", 56 | " 'code': data['solutions']['solution'][i],\n", 57 | " # more information?\n", 58 | " }\n", 59 | " )\n", 60 | "\n", 61 | "# calculate code metrics\n", 62 | "random.seed(42)\n", 63 | "demonstration = random.sample(demonstration, len(demonstration) // 10) # 10% of total data\n", 64 | "demonstration = Dataset.from_list(demonstration)\n", 65 | "demonstration = demonstration.map(compute_code_score, num_proc=16)\n", 66 | "demonstration = demonstration.filter(check_code_score, num_proc=16)\n", 67 | "\n", 68 | "# save\n", 69 | "# demonstration.save_to_disk(os.path.join(os.getcwd(), 'data', 'demonstration'))\n", 70 | "write_dict_to_jsonl(list(demonstration), os.path.join(os.getcwd(), 'data', 'demonstration.jsonl'))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Get 100 demonstrations of particular code property with evenness" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import os\n", 87 | "import random\n", 88 | "import numpy as np\n", 89 | "import pandas as pd\n", 90 | "import matplotlib.pyplot as plt\n", 91 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score, get_code_modularity_score, get_average_length_of_variables\n", 92 | "\n", 93 | "\n", 94 | "random.seed(42) # for reproducibility\n", 95 | "num_sample = 10 # number of samples to be sampled from each bin\n", 96 | "\n", 97 | "# load demonstration pool\n", 98 | "# each data consists of (problem description, code, style score, modularity score)\n", 99 | "file_name = 'demonstration'\n", 100 | "path = f'/home/kdy20401/Workspace/Proj-Code-Generation/MC/data/{file_name}.jsonl'\n", 101 | "demonstration = read_jsonl_to_dict(path)\n", 102 | "print(f'number of codes in demonstration pool: {len(demonstration)}')\n", 103 | "\n", 104 | "code = []\n", 105 | "style = [] # score_pep8\n", 106 | "modularity = [] # score_modularity\n", 107 | "var_len = []\n", 108 | "for data in demonstration:\n", 109 | " code.append(data['code'])\n", 110 | " style.append(data['score_style']['score_pep8'])\n", 111 | " modularity.append(data['score_modularity'])\n", 112 | " var_len.append(get_average_length_of_variables(data['code']))\n", 113 | "\n", 
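    "# stratified sampling: bin the metric into 10 equal-width bins over\n",
    "# [0, 1] and draw num_sample codes from each bin, so the selected\n",
    "# demonstrations cover the whole metric range rather than clustering\n",
    "# around its typical value\n",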
114 | "style_df = pd.DataFrame({'style': np.array(style)})\n", 115 | "modularity_df = pd.DataFrame({'modularity': np.array(modularity)})\n", 116 | "var_len_df = pd.DataFrame({'var_len': np.array(var_len)})\n", 117 | "\n", 118 | "# bins: 0~0.1, 0.1~0.2, ..., 0.9~1.0\n", 119 | "num_bin = 10\n", 120 | "bins = np.linspace(0, 1, num_bin + 1)\n", 121 | "\n", 122 | "# find the grid cell to which each data point belongs\n", 123 | "# include_lowest=True makes 0 style or modularity value included in the first bin\n", 124 | "# style_df['style_bin'] = pd.cut(style_df['style'], bins=bins, labels=False, include_lowest=True)\n", 125 | "# modularity_df['modularity_bin'] = pd.cut(modularity_df['modularity'], bins=bins, labels=False, include_lowest=True)\n", 126 | "var_len_df['var_len_bin'] = pd.cut(var_len_df['var_len'], bins=bins, labels=False, include_lowest=True)\n", 127 | "\n", 128 | "# sample data points from each bin\n", 129 | "# if the number of data points in the bin is less than num_sample, duplication can occur\n", 130 | "# style_sampled_points = style_df.groupby(['style_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 131 | "# modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 132 | "var_len_sampled_points = var_len_df.groupby(['var_len_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 133 | "\n", 134 | "# style_sampled_points.index => (style_bin, code_index)\n", 135 | "# (deduplicated) index of sampled data points \n", 136 | "# style_index = list(set([e[1] for e in style_sampled_points.index]))\n", 137 | "# modularity_index = list(set([e[1] for e in modularity_sampled_points.index]))\n", 138 | "var_len_index = list(set([e[1] for e in var_len_sampled_points.index]))\n", 139 | "\n", 140 | "# the number of samples is less than expected\n", 141 | "# assert len(style_index) == num_bin * num_sample and len(modularity_index) == num_bin * num_sample\n", 142 | "assert len(var_len_index) == num_bin * num_sample\n", 143 | " \n", 144 | "selected_demonstration_by_style = [demonstration[i] for i in style_index]\n", 145 | "selected_demonstration_by_modularity = [demonstration[i] for i in modularity_index]\n", 146 | "selected_demonstration_by_var_len = [demonstration[i] for i in var_len_index]\n", 147 | "\n", 148 | "# save each demonstration which has high coverage of style or modularity\n", 149 | "# write_dict_to_jsonl(selected_demonstration_by_style, os.path.join(os.getcwd(), 'data', 'style_demonstration.jsonl'))\n", 150 | "# write_dict_to_jsonl(selected_demonstration_by_modularity, os.path.join(os.getcwd(), 'data', 'modularity_demonstration.jsonl'))\n", 151 | "write_dict_to_jsonl(selected_demonstration_by_var_len, os.path.join(os.getcwd(), 'data', 'var_len_demonstration.jsonl'))\n", 152 | "\n", 153 | "# for visualization\n", 154 | "# plt.scatter(style_sampled_points['style'], np.array([0.5] * len(style_sampled_points)), color='red', label='Sampled Data')\n", 155 | "# plt.scatter(modularity_sampled_points['modularity'], np.array([0.5] * len(modularity_sampled_points)), color='blue', label='Sampled Data')\n", 156 | "# plt.xlabel('Style')\n", 157 | "# plt.ylabel('Modularity (tmp)')\n", 158 | "# plt.legend()\n", 159 | "# plt.show() " 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "language_info": { 165 | "name": "python" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 2 170 | } 171 | 
-------------------------------------------------------------------------------- /apps/data/2shot_demonstration_101seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":979,"problem_description":"You are given a grid of size M x N, where each square is colored with some random color among K colors with each having equal probability.\n\nA Good Rectangle is defined as one where all squares lying on the inner border are of the same color.\n\nWhat is the expected number of Good Rectangles in the given grid.\n\n-----Input-----\n\n- \nFirst Line contains M, N, K\n\n-----Output-----\nA single value rounded off to the nearest Integer corresponding to the required answer.\n\n-----Constraints-----\n- 1 <= N <= 105 \n- 1 <= M <= 105 \n- 1 <= K <= 105 \n\n-----Example-----\nInput:\n1 3 1\nOutput:\n6","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. Please wrap your code answer using ```:","sc":"n, m, k = list(map(int, input().split()))\n\nif k == 1:\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\n print(ans)\nelse:\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * x\n ans += n * y\n ans -= n * m\n ans += 1e-9\n \n print(\"%.0f\" % ans)","sc_cc":10.0,"mc":"def for1(M,k):\n ret = 0.0\n x = k*k+0.0\n z=x\n for m in range(1,M):\n ret+=(M-m)\/x\n x*=z\n return ret \n \ndef for2(M,k):\n ret = 0.0\n x = k+0.0\n for m in range(1,M):\n ret+=(M-m)\/x\n \n x*=k\n return ret \n \ndef ans(M,N,K):\n\n return int(round(M*N+M*for2(N,K)+N*for2(M,K)+K*for1(M,K)*for1(N,K),0))\nM,N,K = list(map(int,input().split()))\nprint(ans(M,N,K))","mc_cc":1.5,"transformed_mc":["\ndef calculate_good_rectangles(n, m, k):\n if k == 1:\n return calculate_good_rectangles_k1(n, m)\n else:\n return calculate_good_rectangles_k(n, m, k)\n\ndef calculate_good_rectangles_k1(n, m):\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\n return ans\n\ndef calculate_good_rectangles_k(n, m, k):\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * 
x\n ans += n * y\n ans -= n * m\n ans += 1e-9\n return ans\n\ndef main():\n n, m, k = list(map(int, input().split()))\n ans = calculate_good_rectangles(n, m, k)\n print(\"%.0f\" % ans)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["n, m, k = list(map(int, input().split()))\nif k == 1:\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\nelse:\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * x\n ans += n * y\n ans -= n * m\n ans += 1e-9\nprint(\"%.0f\" % ans)"]} 2 | {"problem_id":2109,"problem_description":"10^{10^{10}} participants, including Takahashi, competed in two programming contests.\nIn each contest, all participants had distinct ranks from first through 10^{10^{10}}-th.\nThe score of a participant is the product of his\/her ranks in the two contests.\nProcess the following Q queries:\n - In the i-th query, you are given two positive integers A_i and B_i. Assuming that Takahashi was ranked A_i-th in the first contest and B_i-th in the second contest, find the maximum possible number of participants whose scores are smaller than Takahashi's.\n\n-----Constraints-----\n - 1 \\leq Q \\leq 100\n - 1\\leq A_i,B_i\\leq 10^9(1\\leq i\\leq Q)\n - All values in input are integers.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nQ\nA_1 B_1\n:\nA_Q B_Q\n\n-----Output-----\nFor each query, print the maximum possible number of participants whose scores are smaller than Takahashi's.\n\n-----Sample Input-----\n8\n1 4\n10 5\n3 3\n4 11\n8 9\n22 40\n8 36\n314159265 358979323\n\n-----Sample Output-----\n1\n12\n4\n11\n14\n57\n31\n671644785\n\nLet us denote a participant who was ranked x-th in the first contest and y-th in the second contest as (x,y).\nIn the first query, (2,1) is a possible candidate of a participant whose score is smaller than Takahashi's. There are never two or more participants whose scores are smaller than Takahashi's, so we should print 1.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"import math\nq=int(input())\nfor j in range(q):\n\tAB=[int(i) for i in input().split()]\n\tAB.sort()\n\tab=AB[0]*AB[1] \n\tfor i in range(int(math.sqrt(ab)),AB[1]+1):\n\t\tif i*i>=ab:\n\t\t\tx=i-1\n\t\t\tbreak\n\tif x==0:\n\t\tprint(0)\n\t\tcontinue\n\tfor i in range(int(ab\/x),ab+1):\n\t\tif x*i>=ab:\n\t\t\ty=i-1\n\t\t\tbreak\n\tcnt1=2*x\n\tcnts=1 if x==y else 0\n\tcntdd=1 if x-AB[0]>=0 else 0\n\tcntdu=1 if AB[1]-y>=0 and (AB[1]+1)*(x-AB[1]+y)>=ab else 0 \n\tprint(cnt1-cnts-cntdd-cntdu)","sc_cc":12.0,"mc":"import sys\n\ninput = sys.stdin.readline\nQ = int(input())\n\ndef max_score(x, a):\n ret = 0\n for p in [(x-4), (x-2), x, (x+2), x+4]:\n p += x%2\n p \/\/= 2\n ret = max(ret, (p+(p>=a))*(x-p+1))\n return ret\n\nfor _ in range(Q):\n a, b = list(map(int, input().split()))\n a, b = min(a, b), max(a, b)\n\n def is_ok(x):\n return a*b > max_score(x, a)\n\n def bisect(ng, ok):\n while (abs(ok - ng) > 1):\n mid = (ok + ng) \/\/ 2\n if is_ok(mid):\n ok = mid\n else:\n ng = mid\n return ok\n ans = bisect(2*b+1, a-1)\n print(ans)","mc_cc":2.0,"transformed_mc":["\nimport math\n\ndef calculate_max_participants(AB):\n AB.sort()\n ab = AB[0] * AB[1]\n \n x = calculate_x(ab, AB)\n if x == 0:\n return 0\n \n y = calculate_y(ab, x)\n \n cnt1 = 2 * x\n cnts = 1 if x == y else 0\n cntdd = 1 if x - AB[0] >= 0 else 0\n cntdu = 1 if AB[1] - y >= 0 and (AB[1] + 1) * (x - AB[1] + y) >= ab else 0\n \n return cnt1 - cnts - cntdd - cntdu\n\ndef calculate_x(ab, AB):\n for i in range(int(math.sqrt(ab)), AB[1] + 1):\n if i * i >= ab:\n return i - 1\n return 0\n\ndef calculate_y(ab, x):\n for i in range(int(ab \/ x), ab + 1):\n if x * i >= ab:\n return i - 1\n\ndef main():\n q = int(input())\n for _ in range(q):\n AB = [int(i) for i in input().split()]\n result = calculate_max_participants(AB)\n print(result)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import math\n\nq = int(input())\nfor _ in range(q):\n AB = [int(i) for i in input().split()]\n AB.sort()\n ab = AB[0] * AB[1]\n\n x = 0\n for i in range(int(math.sqrt(ab)), AB[1] + 1):\n if i * i >= ab:\n x = i - 1\n break\n if x == 0:\n result = 0\n continue\n\n for i in range(int(ab \/ x), ab + 1):\n if x * i >= ab:\n y = i - 1\n break\n \n cnt1 = 2 * x\n cnts = 1 if x == y else 0\n cntdd = 1 if x - AB[0] >= 0 else 0\n cntdu = 1 if AB[1] - y >= 0 and (AB[1] + 1) * (x - AB[1] + y) >= ab else 0\n result = cnt1 - cnts - cntdd - cntdu\n print(result)"]} 3 | -------------------------------------------------------------------------------- /codecontests/calculate_corr_between_mos_and_function_call.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate code properties among 10% of original data and save" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "/data/kdy20401/.conda/envs/mc/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 20 | " from .autonotebook import tqdm as notebook_tqdm\n", 21 | "Map (num_proc=16): 100%|██████████| 126447/126447 [01:02<00:00, 2013.54 examples/s]\n", 22 | "Filter (num_proc=16): 100%|██████████| 126447/126447 [00:06<00:00, 19519.37 examples/s]\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import os\n", 28 | "import random\n", 29 | "from datasets import Dataset\n", 30 | "from utils.utils import get_code_style_score, get_code_modularity_score, read_jsonl_to_dict, write_dict_to_jsonl\n", 31 | "\n", 32 | "\n", 33 | "def compute_code_score(example):\n", 34 | " code = example['code']\n", 35 | " try:\n", 36 | " score_modularity = get_code_modularity_score(code)\n", 37 | " except Exception:\n", 38 | " score_modularity = -1.0\n", 39 | "\n", 40 | " example['score_modularity'] = score_modularity\n", 41 | " return example\n", 42 | "\n", 43 | "\n", 44 | "def check_code_score(example):\n", 45 | " return example['score_modularity'] >= 0\n", 46 | "\n", 47 | "\n", 48 | "dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 49 | "demonstration = []\n", 50 | "\n", 51 | "# aggregate demonstration code\n", 52 | "# keys for dataset: dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])\n", 53 | "# keys for solutions: dict_keys(['cc', 'modules', 'passed', 'solution'])\n", 54 | "for data in dataset:\n", 55 | " for i in range(len(data['solutions']['solution'])):\n", 56 | " if data['solutions']['passed'][i]:\n", 57 | " demonstration.append(\n", 58 | " {\n", 59 | " 'description': data['description'],\n", 60 | " 'code': data['solutions']['solution'][i],\n", 61 | " # more information?\n", 62 | " }\n", 63 | " )\n", 64 | "\n", 65 | "# calculate MoS\n", 66 | "random.seed(42)\n", 67 | "demonstration = random.sample(demonstration, len(demonstration) // 10) # 10% of total data\n", 68 | "demonstration = Dataset.from_list(demonstration)\n", 69 | "demonstration = demonstration.map(compute_code_score, num_proc=16)\n", 70 | "demonstration = demonstration.filter(check_code_score, num_proc=16)\n", 71 | "\n", 72 | "# save\n", 73 | "write_dict_to_jsonl(list(demonstration), os.path.join(os.getcwd(), 'data', 'demonstration_with_new_modularity.jsonl'))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Get 500 demonstrations" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 19, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "number of codes in demonstration pool: 125659\n" 93 | ] 94 | }, 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "/tmp/ipykernel_442372/1728575739.py:34: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", 100 | " modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "import os\n", 106 | "import random\n", 107 | "import numpy as np\n", 108 | "import pandas as pd\n", 109 | "import matplotlib.pyplot as plt\n", 110 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score, get_code_modularity_score, get_average_length_of_variables\n", 111 | "\n", 112 | "\n", 113 | "random.seed(27) # for reproducibility\n", 114 | "num_sample = 10 # number of samples to be sampled from each bin\n", 115 | "\n", 116 | "# load demonstration pool\n", 117 | "file_name = 'demonstration_with_new_modularity'\n", 118 | "path = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/{file_name}.jsonl'\n", 119 | "demonstration = read_jsonl_to_dict(path)\n", 120 | "print(f'number of codes in demonstration pool: {len(demonstration)}')\n", 121 | "\n", 122 | "modularity = [] # score_modularity\n", 123 | "for data in demonstration:\n", 124 | " modularity.append(data['score_modularity'])\n", 125 | "\n", 126 | "modularity_df = pd.DataFrame({'modularity': np.array(modularity)})\n", 127 | "\n", 128 | "# bins: 0~0.1, 0.1~0.2, ..., 0.9~1.0\n", 129 | "num_bin = 10\n", 130 | "bins = np.linspace(0, 1, num_bin + 1)\n", 131 | "\n", 132 | "# find the grid cell to which each data point belongs\n", 133 | "# include_lowest=True makes 0 style or modularity value included in the first bin\n", 134 | "modularity_df['modularity_bin'] = pd.cut(modularity_df['modularity'], bins=bins, labels=False, include_lowest=True)\n", 135 | "\n", 136 | "# sample data points from each bin\n", 137 | "# if the number of data points in the bin is less than num_sample, duplication can occur\n", 138 | "modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 139 | "\n", 140 | "# style_sampled_points.index => (style_bin, code_index)\n", 141 | "# (deduplicated) index of sampled data points \n", 142 | "modularity_index = list(set([e[1] for e in modularity_sampled_points.index]))\n", 143 | "\n", 144 | "# the number of samples is less than expected\n", 145 | "# assert len(style_index) == num_bin * num_sample and len(modularity_index) == num_bin * num_sample\n", 146 | "assert len(modularity_index) == num_bin * num_sample\n", 147 | " \n", 148 | "selected_demonstration_by_modularity = [demonstration[i] for i in modularity_index]\n", 149 | "\n", 150 | "# save each demonstration which has high coverage of style or modularity\n", 151 | "write_dict_to_jsonl(selected_demonstration_by_modularity, os.path.join(os.getcwd(), 'data', 'modularity_demonstration_with_new_modularity.jsonl'))\n", 152 | "\n", 153 | "# # for visualization\n", 154 | "# plt.scatter(modularity_sampled_points['modularity'], np.array([0.5] * len(modularity_sampled_points)), color='red', label='Sampled Data')\n", 155 | "# plt.xlabel('MoS')\n", 156 | "# plt.ylabel('temp')\n", 157 | "# plt.legend()\n", 158 | "# plt.show() " 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## calculate corr between mos and function calls" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 1, 171 | "metadata": {}, 172 | "outputs": 
[], 173 | "source": [ 174 | "from utils.utils import count_num_module_calls\n", 175 | "\n", 176 | "base_directory = os.getcwd()\n", 177 | "\n", 178 | "demonstration_dataset = read_jsonl_to_dict(\n", 179 | " \n", 180 | " os.path.join(\n", 181 | " base_directory,\n", 182 | " \"data\",\n", 183 | " 'modularity_demonstration_with_new_modularity.jsonl',\n", 184 | " ) \n", 185 | ")\n", 186 | "\n", 187 | "import matplotlib.pyplot as plt\n", 188 | "from scipy import stats\n", 189 | "\n", 190 | "mos, function_call = [], []\n", 191 | "for data in demonstration_dataset:\n", 192 | " mos.append(data['score_modularity'])\n", 193 | " function_call.append(count_num_module_calls(data['code']))\n", 194 | " \n", 195 | "pearsonr_stat = stats.pearsonr(mos, function_call)\n", 196 | "pearsonr, pearsonr_p = pearsonr_stat.correlation, pearsonr_stat.pvalue\n", 197 | "spearmanr_stat = stats.spearmanr(mos, function_call)\n", 198 | "spearmanr, spearmanr_p = spearmanr_stat.correlation, spearmanr_stat.pvalue\n", 199 | "\n", 200 | "plt.scatter(mos, function_call, color='red', label='Sampled Data')\n", 201 | "plt.xlabel('MoS')\n", 202 | "plt.ylabel('number of function calls')\n", 203 | "plt.legend()\n", 204 | "plt.show()\n", 205 | "\n", 206 | "print(f'pearsonr: {round(pearsonr, 2)}, pearsonr_p: {round(pearsonr_p, 2)}')\n", 207 | "print(f'spearmanr: {round(spearmanr, 2)}, spearmanr_p: {round(spearmanr_p, 2)}')" 208 | ] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "mc", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.9.19" 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 2 232 | } 233 | -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_134seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["Nauuo is a girl who loves writing comments.\n\nOne day, she posted a comment on Codeforces, wondering whether she would get upvotes or downvotes.\n\nIt's known that there were x persons who would upvote, y persons who would downvote, and there were also another z persons who would vote, but you don't know whether they would upvote or downvote. Note that each of the x+y+z people would vote exactly one time.\n\nThere are three different results: if there are more people upvote than downvote, the result will be \"+\"; if there are more people downvote than upvote, the result will be \"-\"; otherwise the result will be \"0\".\n\nBecause of the z unknown persons, the result may be uncertain (i.e. there are more than one possible results). 
More formally, the result is uncertain if and only if there exist two different situations of how the z persons vote, that the results are different in the two situations.\n\nTell Nauuo the result or report that the result is uncertain.\n\nInput\n\nThe only line contains three integers x, y, z (0\u2264 x,y,z\u2264100), corresponding to the number of persons who would upvote, downvote or unknown.\n\nOutput\n\nIf there is only one possible result, print the result : \"+\", \"-\" or \"0\".\n\nOtherwise, print \"?\" to report that the result is uncertain.\n\nExamples\n\nInput\n\n\n3 7 0\n\n\nOutput\n\n\n-\n\nInput\n\n\n2 0 1\n\n\nOutput\n\n\n+\n\nInput\n\n\n1 1 0\n\n\nOutput\n\n\n0\n\nInput\n\n\n0 0 1\n\n\nOutput\n\n\n?\n\nNote\n\nIn the first example, Nauuo would definitely get three upvotes and seven downvotes, so the only possible result is \"-\".\n\nIn the second example, no matter the person unknown downvotes or upvotes, Nauuo would get more upvotes than downvotes. So the only possible result is \"+\".\n\nIn the third example, Nauuo would definitely get one upvote and one downvote, so the only possible result is \"0\".\n\nIn the fourth example, if the only one person upvoted, the result would be \"+\", otherwise, the result would be \"-\". There are two possible results, so the result is uncertain.", "You have a sequence a with n elements 1, 2, 3, ..., k - 1, k, k - 1, k - 2, ..., k - (n - k) (k \u2264 n < 2k).\n\nLet's call as inversion in a a pair of indices i < j such that a[i] > a[j].\n\nSuppose, you have some permutation p of size k and you build a sequence b of size n in the following manner: b[i] = p[a[i]].\n\nYour goal is to find such permutation p that the total number of inversions in b doesn't exceed the total number of inversions in a, and b is lexicographically maximum.\n\nSmall reminder: the sequence of k integers is called a permutation if it contains all integers from 1 to k exactly once.\n\nAnother small reminder: a sequence s is lexicographically smaller than another sequence t, if either s is a prefix of t, or for the first i such that s_i \u2260 t_i, s_i < t_i holds (in the first position that these sequences are different, s has smaller number than t).\n\nInput\n\nThe first line contains a single integer t (1 \u2264 t \u2264 1000) \u2014 the number of test cases.\n\nThe first and only line of each test case contains two integers n and k (k \u2264 n < 2k; 1 \u2264 k \u2264 10^5) \u2014 the length of the sequence a and its maximum.\n\nIt's guaranteed that the total sum of k over test cases doesn't exceed 10^5.\n\nOutput\n\nFor each test case, print k integers \u2014 the permutation p which maximizes b lexicographically without increasing the total number of inversions.\n\nIt can be proven that p exists and is unique.\n\nExample\n\nInput\n\n\n4\n1 1\n2 2\n3 2\n4 3\n\n\nOutput\n\n\n1 \n1 2 \n2 1 \n1 3 2 \n\nNote\n\nIn the first test case, the sequence a = [1], there is only one permutation p = [1].\n\nIn the second test case, the sequence a = [1, 2]. There is no inversion in a, so there is only one permutation p = [1, 2] which doesn't increase the number of inversions.\n\nIn the third test case, a = [1, 2, 1] and has 1 inversion. If we use p = [2, 1], then b = [p[a[1]], p[a[2]], p[a[3]]] = [2, 1, 2] and also has 1 inversion.\n\nIn the fourth test case, a = [1, 2, 3, 2], and since p = [1, 3, 2] then b = [1, 3, 2, 3]. 
Both a and b have 1 inversion and b is the lexicographically maximum."], "public_tests": [{"input": ["3 7 0\n", "1 1 0\n", "0 0 1\n", "2 0 1\n"], "output": ["-", "0", "?", "+"]}, {"input": ["4\n1 1\n2 2\n3 2\n4 3\n"], "output": ["\n1 \n1 2 \n2 1 \n1 3 2 \n"]}], "private_tests": [{"input": ["100 0 100\n", "80 63 18\n", "25 12 100\n", "80 29 11\n", "10 5 6\n", "94 37 25\n", "98 82 13\n", "21 24 18\n", "1 2 2\n", "88 88 0\n", "73 29 43\n", "58 83 39\n", "97 33 19\n", "1 3 4\n", "100 100 0\n", "62 63 12\n", "99 20 7\n", "21 52 5\n", "43 9 61\n", "45 0 44\n", "7 4 4\n", "100 100 100\n", "34 51 3\n", "0 0 100\n", "3 3 2\n", "34 44 21\n", "87 98 19\n", "60 60 32\n", "22 99 77\n", "28 99 70\n", "33 24 13\n", "79 42 12\n", "48 100 48\n", "58 97 4\n", "52 14 10\n", "12 1 11\n", "5 2 10\n", "93 21 2\n", "8 5 5\n", "58 83 8\n", "97 64 6\n", "49 8 6\n", "13 6 8\n", "82 98 93\n", "7 4 3\n", "37 5 15\n", "100 0 99\n", "21 50 0\n", "0 100 48\n", "5 7 1\n", "42 40 4\n", "36 3 35\n", "8 87 7\n", "21 55 9\n", "0 0 0\n", "78 95 14\n", "0 100 99\n", "25 39 32\n", "89 41 36\n", "82 84 16\n", "25 35 23\n", "47 78 6\n", "42 43 16\n", "1 1 1\n", "1 0 1\n", "43 93 9\n", "3 4 5\n", "92 93 10\n", "0 87 13\n", "1 50 50\n", "100 0 48\n", "13 1 13\n", "19 90 4\n", "2 2 1\n", "98 44 17\n", "2 1 3\n", "2 82 17\n", "40 51 11\n", "83 3 8\n", "96 71 19\n", "62 56 5\n", "21 31 14\n", "50 100 50\n", "0 100 0\n", "96 55 0\n", "26 92 6\n", "6 5 4\n", "97 71 36\n", "74 2 16\n", "66 27 9\n", "47 40 10\n", "7 3 5\n", "3 2 3\n", "5 1 6\n", "86 1 0\n", "46 1 89\n", "5 3 3\n", "60 33 15\n", "4 3 1\n", "12 89 2\n", "5 5 3\n", "9 8 2\n", "100 48 48\n", "97 78 2\n", "1 2 7\n", "2 87 10\n", "15 4 15\n", "58 58 1\n", "5 3 2\n", "100 50 50\n"], "output": ["?", "?", "?", "+", "?", "+", "+", "?", "?", "0", "+", "?", "+", "?", "0", "?", "+", "-", "?", "+", "?", "?", "-", "?", "?", "?", "?", "?", "?", "-", "?", "+", "-", "-", "+", "?", "?", "+", "?", "-", "+", "+", "?", "?", "?", "+", "+", "-", "-", "-", "?", "?", "-", "-", "0", "-", "-", "?", "+", "?", "?", "-", "?", "?", "?", "-", "?", "?", "-", "?", "+", "?", "-", "?", "+", "?", "-", "?", "+", "+", "+", "?", "?", "-", "+", "-", "?", "?", "+", "+", "?", "?", "?", "?", "+", "?", "?", "+", "?", "-", "?", "?", "+", "+", "?", "-", "?", "?", "?", "?"]}, {"input": [], "output": []}], "transformed_sc": ["x, y, z = map(int, input().split())\nif (x+z) == y and (z+y) == x:\n print(\"0\")\nelif (x+z) >= y and (z+y) >= x:\n print(\"?\")\nelif x > y or (x+z) > y and (y+z) < x:\n print(\"+\")\nelif y > x or (y+z) > x and (x+z) < y:\n print(\"-\")", "test_cases = int(input())\nfor _ in range(test_cases):\n n, k = map(int, input().split())\n r = [y + 1 for y in range(k)]\n sequence = []\n h = 1\n t = 0\n for _ in range(n):\n if t == 0:\n sequence.append(h)\n h += 1\n if h > k:\n h = k - 1\n t = -1\n else:\n sequence.append(h)\n h -= 1\n if h <= 0:\n h = 1\n t = 0\n\n freq_map = {}\n for num in sequence:\n if num not in freq_map:\n freq_map[num] = 1\n else:\n freq_map[num] += 1\n\n p = max(freq_map.keys())\n for num, freq in freq_map.items():\n if freq > 1:\n p = num - 1\n break\n \n z = r[p:]\n z.sort(reverse=True)\n result = r[:p] + z\n\n print(*result)"], "sc": ["x,y,z=input().split()\nx=int(x)\ny=int(y)\nz=int(z)\nif (x+z)==y and (z+y)==x :\n print(\"0\")\nelif (x+z)>=y and (z+y)>=x:\n print(\"?\")\nelif x>y or (x+z)>y and (y+z)x or (y+z)>x and (x+z) c:\n h = c-1\n t = -1\n\n else:\n d.append(h)\n h -= 1\n\n if h <= 0:\n h = 1\n t = 0\n\n n = {}\n for y in d:\n if n.get(y) == None:\n n[y] = 1\n else:\n 
n[y] += 1\n\n p = c\n for y in n:\n if n[y] > 1:\n p = y-1\n break\n\n z = r[p:]\n z.sort(reverse=True)\n ss = (r[:p]+z)\n print(*ss)\n\n"], "sc_cc": [11.0, 11.0], "transformed_mc": ["\ndef determine_vote_result(x, y, z):\n if (x+z) == y and (z+y) == x:\n return \"0\"\n elif (x+z) >= y and (z+y) >= x:\n return \"?\"\n elif x > y or (x+z) > y and (y+z) < x:\n return \"+\"\n elif y > x or (y+z) > x and (x+z) < y:\n return \"-\"\n\ndef main():\n x, y, z = map(int, input().split())\n result = determine_vote_result(x, y, z)\n print(result)\n\nif __name__ == '__main__':\n main()\n", "def build_sequence(n, k):\n d = []\n h = 1\n t = 0\n for _ in range(n):\n if t == 0:\n d.append(h)\n h += 1\n if h > k:\n h = k - 1\n t = -1\n else:\n d.append(h)\n h -= 1\n if h <= 0:\n h = 1\n t = 0\n return d\n\ndef count_frequency(sequence):\n freq_map = {}\n for num in sequence:\n if num not in freq_map:\n freq_map[num] = 1\n else:\n freq_map[num] += 1\n return freq_map\n\ndef find_p_value(freq_map):\n p = max(freq_map.keys())\n for num, freq in freq_map.items():\n if freq > 1:\n p = num - 1\n break\n return p\n\ndef sort_and_combine(r, p, k):\n z = r[p:]\n z.sort(reverse=True)\n result = r[:p] + z\n return result\n\ndef find_permutation(n, k):\n r = [y + 1 for y in range(k)]\n sequence = build_sequence(n, k)\n freq_map = count_frequency(sequence)\n p = find_p_value(freq_map)\n result = sort_and_combine(r, p, k)\n return result\n\ndef main():\n test_cases = int(input())\n for _ in range(test_cases):\n n, k = map(int, input().split())\n result = find_permutation(n, k)\n print(*result)\n\nif __name__ == '__main__':\n main()"]} 2 | -------------------------------------------------------------------------------- /apps/icl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | import torch 10 | 11 | from datasets import load_dataset, Dataset 12 | 13 | from vllm import LLM, SamplingParams 14 | 15 | from utils import read_jsonl_to_dict, write_dict_to_jsonl, get_avg_cc 16 | 17 | import sys 18 | 19 | 20 | def set_seed(seed): 21 | random.seed(seed) 22 | np.random.seed(seed) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed(seed) 25 | # When running on the CuDNN backend, two further options must be set 26 | torch.backends.cudnn.deterministic = True 27 | torch.backends.cudnn.benchmark = False 28 | # Set a fixed value for the hash seed 29 | os.environ["PYTHONHASHSEED"] = str(seed) 30 | 31 | 32 | def get_transformed_demonstration(args, data): 33 | demonstration = defaultdict(list) 34 | 35 | for i in range(args.num_icl_shot): 36 | if "sc" in args.code_type: 37 | instruction = data["sc_instruction"][i] 38 | else: 39 | instruction = data["mc_instruction"][i] 40 | 41 | if "transformed" in args.code_type: 42 | code = data[args.code_type][i][0].strip() 43 | else: 44 | code = data[args.code_type][i].strip() 45 | 46 | demonstration["problem_id"].append(data["problem_id"][i]) 47 | demonstration["description"].append(data["problem_description"][i].strip()) 48 | demonstration["instruction"].append(instruction) 49 | demonstration["starter_code"].append(data["starter_code"][i]) 50 | demonstration["code"].append(code) 51 | demonstration["code_cc"].append(get_avg_cc(data[args.code_type][i])) 52 | 53 | return demonstration 54 | 55 | 56 | def extract_solution(args, generation): 57 | if args.num_icl_shot > 0: 58 | start_index = generation.find("```") 59 | 
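        # the prompt asks the model to wrap its answer in triple backticks; the logic below takes the first complete fenced span and falls back to an empty string when the fence is missing or unterminated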
        if start_index == -1:
60 |             solution = ""
61 |         else:
62 |             end_index = generation.find("```", start_index + len("```"))
63 |             if start_index < end_index:
64 |                 solution = generation[start_index + len("```") : end_index]
65 |             else:
66 |                 solution = ""
67 |     else:
68 |         solution = ""  # zero-shot prompting is not implemented yet
69 |     return solution
70 | 
71 | def make_prompt(args, demonstration, test_data):
72 |     if test_data["starter_code"] == "":
73 |         question_guide = "read from and write to standard IO"
74 |     else:
75 |         question_guide = "use the provided function signature"
76 | 
77 |     if "sc" in args.code_type:
78 |         instruction = (
79 |             "Write a python code to solve the following coding problem "
80 |             "that obeys the constraints and passes the example test cases. "
81 |             f"The output code needs to {question_guide}. "
82 |             "Please wrap your code answer using ```:"
83 |         )
84 |     elif "mc" in args.code_type:
85 |         instruction = (
86 |             "Write a python code to solve the following coding problem "
87 |             "that obeys the constraints and passes the example test cases. "
88 |             f"The output code needs to {question_guide}. "
89 |             "Ensure modularity of the python code by dividing the code into smaller, "
90 |             "useful functions to solve the given problem. "
91 |             "Please wrap your code answer using ```:"
92 |         )
93 | 
94 |     # instruction of CodeLlama for APPS
95 |     if "meta-llama/CodeLlama" in args.model:
96 |         # make zero-shot or few-shot prompt
97 |         prompt = ""
98 |         if args.num_icl_shot == 0:
99 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
100 |         elif args.num_icl_shot > 0:
101 |             for i in range(args.num_icl_shot):
102 |                 prompt += "Q: " + demonstration["instruction"][i] + "\n"
103 |                 prompt += demonstration["description"][i] + "\n"
104 |                 prompt += demonstration["starter_code"][i] + "\n"
105 |                 prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n"
106 |             prompt += "Q: " + instruction + "\n"
107 |             prompt += test_data["question"] + "\n"
108 |             prompt += test_data["starter_code"] + "\n"
109 |             prompt += "A: "
110 | 
111 |     # instruction of DeepseekCoder for APPS
112 |     elif "deepseek-ai/deepseek-coder" in args.model:
113 |         # make zero-shot or few-shot prompt
114 |         prompt = ""
115 |         if args.num_icl_shot == 0:
116 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
117 |         elif args.num_icl_shot > 0:
118 |             for i in range(args.num_icl_shot):
119 |                 prompt += demonstration["instruction"][i] + "\n"
120 |                 prompt += "### Instruction:\n" + demonstration["description"][i] + "\n"
121 |                 prompt += demonstration["starter_code"][i] + "\n"
122 |                 prompt += (
123 |                     "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n"
124 |                 )
125 |             prompt += instruction + "\n"
126 |             prompt += "### Instruction:\n" + test_data["question"] + "\n"
127 |             prompt += test_data["starter_code"] + "\n"
128 |             prompt += "### Response:\n"
129 | 
130 |     return prompt
131 | 
132 | 
133 | def main():
134 |     parser = argparse.ArgumentParser()
135 |     parser.add_argument("--seed", type=int, default=42)
136 |     parser.add_argument("--model", type=str, default="meta-llama/CodeLlama-7b-hf")
137 |     parser.add_argument("--num_gpu", type=int, default=1)
138 |     parser.add_argument("--dtype", type=str, default="float16")
139 |     parser.add_argument("--num_icl_shot", type=int, default=2)
140 |     parser.add_argument(
141 |         "--num_gen",
142 |         type=int,
143 |         default=1,
144 |         help="number of solutions generated per problem",
145 |     )
146 |     parser.add_argument("--code_type", type=str, default="sc")
147 |     parser.add_argument(
148 |         "--temperature",
149 |         type=float,
150 |         default=0,
151 |         help="0 means greedy decoding for vllm",
152 |     )
153 |     parser.add_argument("--max_new_token", type=int, default=1024)
154 |     parser.add_argument("--top_p", type=float, default=0.95)
155 |     parser.add_argument(
156 |         "--modify",
157 |         type=str,
158 |         default="original",
159 |         help="modification method of the demonstration code",
160 |     )
161 |     parser.add_argument(
162 |         "--swap_space",
163 |         type=int,
164 |         default=4,
165 |         help="The size (GiB) of CPU memory per GPU to use as swap space",
166 |     )
167 | 
168 |     args = parser.parse_args()
169 | 
170 |     set_seed(args.seed)
171 | 
172 |     base_directory = os.path.dirname(__file__)
173 |     if not os.path.exists(os.path.join(base_directory, "result")):
174 |         os.makedirs(os.path.join(base_directory, "result"))
175 |     file_name = f"{args.model.replace('/', '-')}_{args.code_type}_{args.modify}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
176 | 
177 |     data = Dataset.from_json(
178 |         os.path.join(
179 |             os.path.dirname(__file__),
180 |             "data",
181 |             f"2shot_demonstration_{args.seed}seed_reduced2.json",
182 |         )
183 |     )
184 | 
185 |     demonstration = get_transformed_demonstration(args, data)
186 | 
187 |     # load apps test dataset
188 |     test_dataset = load_dataset("codeparrot/apps", split="test", trust_remote_code=True)
189 |     # filtering for specific platforms
190 |     words = ["codeforces", "atcoder", "codechef"]
191 |     test_dataset = test_dataset.filter(
192 |         lambda x: any(word in x["url"] for word in words)
193 |     )
194 | 
195 |     prompts = []
196 |     for test_data in test_dataset:
197 |         prompt = make_prompt(args, demonstration, test_data)
198 |         prompts.append(prompt)
199 |     start_index = 0  # offset of the first unsolved problem (nonzero when resuming)
200 |     if os.path.exists(os.path.join(base_directory, "result", file_name)):
201 |         results = read_jsonl_to_dict(os.path.join(base_directory, "result", file_name))
202 |         start_index = len(results)
203 |         if start_index != len(prompts):
204 |             prompts = prompts[start_index:]
205 |         else:
206 |             print("All problems are already solved.")
207 |             sys.exit()
208 | 
209 |     # load model
210 |     # when initializing VLLM engine, random.seed() is called internally.
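    # (note: codecontests/icl_corr.py adds that set_seed() should be called after the engine is initialized, since vLLM may reseed Python's RNG; here set_seed(args.seed) ran earlier, so re-calling it after the LLM(...) call below is a reasonable safeguard if exact sampling reproducibility matters)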
211 |     model = LLM(
212 |         model=args.model,
213 |         tensor_parallel_size=args.num_gpu,
214 |         dtype=args.dtype,
215 |         max_model_len=8192,
216 |         swap_space=args.swap_space,
217 |     )
218 |     if "meta-llama/CodeLlama" in args.model:
219 |         stop = ["Q:", "A:"]
220 |     elif "deepseek-ai/deepseek-coder" in args.model:
221 |         stop = ["### Instruction", "### Response"]
222 | 
223 |     sampling_params = SamplingParams(
224 |         n=args.num_gen,
225 |         temperature=args.temperature,
226 |         top_p=args.top_p,
227 |         max_tokens=args.max_new_token,
228 |         stop=stop,
229 |     )
230 | 
231 |     # inference using vllm
232 |     generations = []
233 |     solutions = []
234 |     for idx, prompt in enumerate(tqdm(prompts)):
235 |         outputs = model.generate(
236 |             prompt, sampling_params=sampling_params, use_tqdm=False
237 |         )
238 | 
239 |         for output in outputs:
240 |             # for each input in the prompts, args.gen_num number of outputs are generated
241 |             generations_ = [outs.text for outs in output.outputs]
242 |             assert len(generations_) == args.num_gen
243 |             # extract solution code from generated code
244 |             solutions_ = [
245 |                 extract_solution(args, generation) for generation in generations_
246 |             ]
247 |             # save generated solutions (list)
248 |             generations.append(generations_)
249 |             solutions.append(solutions_)
250 | 
251 |         # save generated solutions
252 |         result = []
253 | 
254 |         result.append(
255 |             {
256 |                 "problem_id": test_dataset[start_index + idx]["problem_id"],
257 |                 "description": test_dataset[start_index + idx]["question"],
258 |                 "difficulty": test_dataset[start_index + idx]["difficulty"],
259 |                 "starter_code": test_dataset[start_index + idx]["starter_code"],
260 |                 "generated_solutions": generations_,
261 |                 "extracted_solutions": solutions_,
262 |                 "prompt": prompt,
263 |                 "demonstration": demonstration,
264 |             }
265 |         )
266 | 
267 |         write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name))
268 | 
269 |     print("program ends.")
270 | 
271 | 
272 | if __name__ == "__main__":
273 |     main()
274 | 
--------------------------------------------------------------------------------
/codecontests/sc2mc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | 
4 | from utils.utils_evaluate import verify_code_official
5 | 
6 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl
7 | 
8 | from openai import OpenAI
9 | 
10 | import multiprocessing
11 | 
12 | 
13 | problem_description = '''\
14 | QUESTION:
15 | Given a permutation $p$ of length $n$, find its subsequence $s_1$, $s_2$, $\ldots$, $s_k$ of length at least $2$ such that: $|s_1-s_2|+|s_2-s_3|+\ldots+|s_{{k-1}}-s_k|$ is as big as possible over all subsequences of $p$ with length at least $2$. Among all such subsequences, choose the one whose length, $k$, is as small as possible.
16 | 
17 | If multiple subsequences satisfy these conditions, you are allowed to find any of them.
18 | 
19 | A sequence $a$ is a subsequence of an array $b$ if $a$ can be obtained from $b$ by deleting some (possibly, zero or all) elements.
20 | 
21 | A permutation of length $n$ is an array of length $n$ in which every element from $1$ to $n$ occurs exactly once.
22 | 
23 | 
24 | -----Input-----
25 | 
26 | The first line contains an integer $t$ ($1 \le t \le 2 \cdot 10^4$) — the number of test cases. The description of the test cases follows.
27 | 
28 | The first line of each test case contains an integer $n$ ($2 \le n \le 10^5$) — the length of the permutation $p$.
29 | 30 | The second line of each test case contains $n$ integers $p_1$, $p_2$, $\ldots$, $p_{{n}}$ ($1 \le p_i \le n$, $p_i$ are distinct) — the elements of the permutation $p$. 31 | 32 | The sum of $n$ across the test cases doesn't exceed $10^5$. 33 | 34 | 35 | -----Output----- 36 | 37 | For each test case, the first line should contain the length of the found subsequence, $k$. The second line should contain $s_1$, $s_2$, $\ldots$, $s_k$ — its elements. 38 | 39 | If multiple subsequences satisfy these conditions, you are allowed to find any of them. 40 | 41 | 42 | -----Example----- 43 | Input 44 | 2 45 | 3 46 | 3 2 1 47 | 4 48 | 1 3 4 2 49 | 50 | Output 51 | 2 52 | 3 1 53 | 3 54 | 1 4 2 55 | 56 | 57 | 58 | -----Note----- 59 | 60 | In the first test case, there are $4$ subsequences of length at least $2$: $[3,2]$ which gives us $|3-2|=1$. $[3,1]$ which gives us $|3-1|=2$. $[2,1]$ which gives us $|2-1|=1$. $[3,2,1]$ which gives us $|3-2|+|2-1|=2$. 61 | 62 | So the answer is either $[3,1]$ or $[3,2,1]$. Since we want the subsequence to be as short as possible, the answer is $[3,1]$.\ 63 | ''' 64 | 65 | sc = '''\ 66 | ANSWER: 67 | ```python 68 | import sys 69 | for _ in range(int(input())): 70 | n = int(input()) 71 | data = list(map(int, input().split())) 72 | ans = [data[0]] 73 | for i in range(1, n - 1): 74 | if data[i - 1] < data[i] > data[i + 1] or data[i - 1] > data[i] < data[i + 1]: 75 | ans += [data[i]] 76 | print(len(ans) + 1) 77 | print(*ans, data[-1]) 78 | ```\ 79 | ''' 80 | 81 | mc = '''\ 82 | ```python 83 | import sys 84 | 85 | def ii(): 86 | return sys.stdin.readline().strip() 87 | 88 | def idata(): 89 | return [int(x) for x in ii().split()] 90 | 91 | def solve_of_problem(): 92 | n = int(ii()) 93 | data = idata() 94 | ans = [data[0]] 95 | for i in range(1, n - 1): 96 | if data[i - 1] < data[i] > data[i + 1] or data[i - 1] > data[i] < data[i + 1]: 97 | ans += [data[i]] 98 | print(len(ans) + 1) 99 | print(*ans, data[-1]) 100 | return 101 | 102 | if __name__ == '__main__': 103 | for ______ in range(int(ii())): 104 | solve_of_problem() 105 | ```\ 106 | ''' 107 | 108 | sc2mc_instruction = '''\ 109 | Refactor the above python program following the question. Follow the guidelines 110 | * make the program more modular with smaller and meaningful helper functions 111 | * good descriptive names for the helper functions 112 | * have an entry function called ‘main()’ 113 | * 'main()' is called inside 'if __name__ == '__main__'' 114 | 115 | Do not change the original semantics of the program significantly and no need to perform optimizations. \ 116 | Enclose the program within backticks as shown above\ 117 | ''' 118 | 119 | mc2sc_instruction = '''\ 120 | Refactor the above program. Follow the guidelines 121 | * make the program monolithic without helper functions 122 | * transform the program with multiple functions into a single piece of code 123 | * do not copy the given code exactly as it is 124 | * eliminate any modular structures such as separate functions or classes, merging them into a continuous, unified script 125 | 126 | Do not change the original semantics of the program significantly and no need to perform optimizations. 
\
127 | Enclose the program within backticks as shown above\
128 | '''
129 | 
130 | sc2mc_demonstration = {
131 |     'problem_description': problem_description,
132 |     'sc': sc,
133 |     'mc': mc,
134 |     'instruction': sc2mc_instruction
135 | }
136 | 
137 | mc2sc_demonstration = {
138 |     'problem_description': problem_description,
139 |     'sc': sc,
140 |     'mc': mc,
141 |     'instruction': mc2sc_instruction
142 | }
143 | 
144 | def make_gpt_chat_message(role, content):
145 |     return {'role': role, 'content': content}
146 | 
147 | 
148 | def make_sc2mc_prompt(demonstration, input_data, shot):
149 |     problem_description = demonstration['problem_description']
150 |     sc = demonstration['sc']
151 |     mc = demonstration['mc']
152 |     instruction = demonstration['instruction']
153 | 
154 |     input_problem_description = input_data['problem_description']
155 |     input_code = input_data['code']
156 | 
157 |     messages = []
158 | 
159 |     # zero-shot prompt for sc -> mc
160 |     if shot == 0:
161 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
162 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
163 |     # 1-shot prompt for sc -> mc
164 |     else:
165 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
166 |         messages.append(make_gpt_chat_message('user', problem_description + '\n' + sc + '\n' + instruction))
167 |         messages.append(make_gpt_chat_message('assistant', mc))
168 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
169 | 
170 |     return messages
171 | 
172 | 
173 | def make_mc2sc_prompt(demonstration, input_data, shot):
174 |     problem_description = demonstration['problem_description']
175 |     sc = demonstration['sc']
176 |     mc = demonstration['mc']
177 |     instruction = demonstration['instruction']
178 | 
179 |     input_problem_description = input_data['problem_description']
180 |     input_code = input_data['code']
181 | 
182 |     messages = []
183 | 
184 |     # zero-shot prompt for mc -> sc
185 |     if shot == 0:
186 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
187 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
188 |     # 1-shot prompt for mc -> sc
189 |     else:
190 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
191 |         messages.append(make_gpt_chat_message('user', problem_description + '\n' + mc + '\n' + instruction))
192 |         messages.append(make_gpt_chat_message('assistant', sc))
193 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
194 | 
195 |     return messages
196 | 
197 | 
198 | def check_correctness(code, tests):
199 |     GLOBAL_TIMEOUT = 10
200 | 
201 |     def _temp_run(code, tests, result):
202 |         try:
203 |             flag, outcomes = verify_code_official(tests, code)
204 |             result.append(flag)
205 |         except Exception:
206 |             pass
207 | 
208 |     manager = multiprocessing.Manager()
209 |     result = manager.list()
210 |     p = multiprocessing.Process(target=_temp_run, args=(code, tests, result))
211 |     p.start()
212 |     p.join(timeout=GLOBAL_TIMEOUT + 1)
213 |     if p.is_alive():
214 |         p.kill()
215 |     if not result:
216 |         result = [-1]
217 |     if result[0] == True:
218 |         return True
219 |     else:
220 |         return False
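# check_correctness runs the candidate program in a separate process so that hanging code is bounded by GLOBAL_TIMEOUT;
# the manager list is how the child process reports its pass/fail flag back to the parent.
# usage sketch (hypothetical test data): check_correctness("print(1)", {'inputs': [''], 'outputs': ['1\n']})
# should return True only when verify_code_official reports that every test passed.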
221 | 
222 | def main():
223 |     # seeds = [27, 42, 101, 134, 169]
224 |     # seeds = [42, 101, 134, 169]
225 |     seeds = [101]
226 |     code_type = 'monolithic'
227 |     client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
228 | 
229 |     for seed in seeds:
230 |         base_directory = os.getcwd()
231 |         file_name = f"{code_type}_2shot_demonstration_{seed}seed.jsonl"
232 |         data = read_jsonl_to_dict(os.path.join(base_directory, 'data', file_name))[0]
233 | 
234 |         transformed_code = []
235 |         passed = []
236 |         # 2 examples are in demonstration
237 |         for i in range(2):
238 |             problem_description = data['problem_description'][i]
239 |             input_code = data['code'][i]
240 |             input_data = {'problem_description': problem_description, 'code': input_code}
241 |             messages = make_sc2mc_prompt(sc2mc_demonstration, input_data, shot=1)
242 | 
243 |             completion = client.chat.completions.create(
244 |                 model="gpt-3.5-turbo",
245 |                 messages=messages,
246 |                 max_tokens=1024,
247 |                 stop=["\n\n\n\n", "####", "----"],
248 |                 temperature=0,
249 |             )
250 |             response = completion.choices[0].message.content
251 | 
252 |             start_index = response.find('```python')
253 |             if start_index != -1:
254 |                 end_index = response.find('```', start_index + len('```python'))
255 |                 if end_index != -1:
256 |                     response = response[start_index + len('```python'): end_index]
257 |                 else:
258 |                     response = response[start_index + len('```python'):]
259 |             transformed_code.append(response)
260 | 
261 |             ## correctness check
262 |             tests = {'inputs': [], 'outputs': []}
263 |             tests['inputs'].extend(data['public_tests'][i]['input'])
264 |             tests['inputs'].extend(data['private_tests'][i]['input'])
265 |             tests['outputs'].extend(data['public_tests'][i]['output'])
266 |             tests['outputs'].extend(data['private_tests'][i]['output'])
267 | 
268 |             if check_correctness(response, tests):
269 |                 print('pass')
270 |                 passed.append(True)
271 |             else:
272 |                 print('not passed')
273 |                 passed.append(False)
274 | 
275 |         data['transformed_code'] = transformed_code
276 |         data['passed'] = passed
277 |         write_dict_to_jsonl([data], os.path.join(base_directory, 'data', file_name))
278 | 
279 | if __name__ == '__main__':
280 |     main()
--------------------------------------------------------------------------------
/codecontests/icl_corr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import argparse
4 | from tqdm import tqdm
5 | import numpy as np
6 | from collections import defaultdict
7 | import torch
8 | from datasets import load_dataset
9 | from vllm import LLM, SamplingParams
10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score
11 | 
12 | 
13 | def set_seed(seed):
14 |     random.seed(seed)
15 |     np.random.seed(seed)
16 |     torch.manual_seed(seed)
17 |     torch.cuda.manual_seed(seed)
18 |     # When running on the CuDNN backend, two further options must be set
19 |     torch.backends.cudnn.deterministic = True
20 |     torch.backends.cudnn.benchmark = False
21 |     # Set a fixed value for the hash seed
22 |     os.environ["PYTHONHASHSEED"] = str(seed)
23 | 
24 | 
25 | def extract_solution(args, generation):
26 |     if "meta-llama/CodeLlama" in args.model:
27 |         if args.num_icl_shot == 0:
28 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
29 |         elif args.num_icl_shot > 0:
30 |             start_index = generation.find("```")
31 |             if start_index == -1:
32 |                 solution = ""
33 |             else:
34 |                 end_index = generation.find("```", start_index + len("```"))
35 |                 if start_index < end_index:
36 |                     solution = generation[start_index + len("```") : end_index]
37 |                 else:
38 |                     solution = ""
39 | 
40 |     elif "deepseek-ai/deepseek-coder" in args.model:
"deepseek-ai/deepseek-coder" in args.model: 41 | if args.num_icl_shot == 0: 42 | assert () # not implemented yet 43 | elif args.num_icl_shot > 0: 44 | start_index = generation.find("```") 45 | if start_index == -1: 46 | solution = "" 47 | else: 48 | end_index = generation.find("```", start_index + len("```")) 49 | if start_index < end_index: 50 | solution = generation[start_index + len("```") : end_index] 51 | else: 52 | solution = "" 53 | 54 | return solution 55 | 56 | 57 | def make_prompt(args, demonstration, test_data): 58 | instruction = ( 59 | "Write a python code to solve the following coding problem " 60 | "that obeys the constraints and passes the example test cases. " 61 | "The output code needs to read from and write to standard IO. " 62 | "Please wrap your code answer using ```:" 63 | ) 64 | 65 | if "meta-llama/CodeLlama" in args.model: 66 | # make zero-shot or few-shot prompt 67 | prompt = "" 68 | if args.num_icl_shot == 0: 69 | assert () # not implemented yet 70 | elif args.num_icl_shot > 0: 71 | for i in range(args.num_icl_shot): 72 | prompt += "Q: " + instruction + "\n" 73 | prompt += demonstration["description"][i] + "\n" 74 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 75 | prompt += "Q: " + instruction + "\n" 76 | prompt += test_data["description"] + "\n" 77 | prompt += "A: " 78 | elif "deepseek-ai/deepseek-coder" in args.model: 79 | # make zero-shot or few-shot prompt 80 | prompt = "" 81 | if args.num_icl_shot == 0: 82 | assert () # not implemented yet 83 | elif args.num_icl_shot > 0: 84 | prompt += instruction + "\n" 85 | for i in range(args.num_icl_shot): 86 | prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 87 | prompt += ( 88 | "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 89 | ) 90 | prompt += "### Instruction:\n" + test_data["description"] + "\n" 91 | prompt += "### Response:\n" 92 | 93 | return prompt 94 | 95 | 96 | def main(): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--seed", type=int, required=True, default=42) 99 | parser.add_argument( 100 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 101 | ) 102 | parser.add_argument("--num_gpu", type=int, required=True, default=1, help="total number of gpus used") 103 | parser.add_argument("--dtype", type=str, required=True, default="float16") 104 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 105 | parser.add_argument( 106 | "--num_gen", 107 | type=int, 108 | required=True, 109 | default=1, 110 | help="number of solutions generated per problem", 111 | ) 112 | parser.add_argument( 113 | "--temperature", 114 | type=float, 115 | required=True, 116 | default=0, 117 | help="0 means greedy decoding for vllm", 118 | ) 119 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 120 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 121 | parser.add_argument( 122 | "--swap_space", 123 | type=int, 124 | required=False, 125 | default=4, 126 | help="The size (GiB) of CPU memory per GPU to use as swap space", 127 | ) 128 | parser.add_argument( 129 | "--metric", 130 | type=str, 131 | required=True, 132 | default='style', 133 | help="code metric (e.g., style or modularity)", 134 | ) 135 | # additional arguments candidiates: 136 | # max_model_len 137 | # stop 138 | # start_token, end_token 139 | args = parser.parse_args() 140 | 141 | # this code is impelemented for only 1-shot ICL 142 | assert args.num_icl_shot == 1 143 | 144 | # load model 145 | 
# when initializing VLLM engine, random.seed() is called internally. 146 | # so, set_seed() should be called after initializing VLLM engine. 147 | model = LLM( 148 | model=args.model, 149 | tensor_parallel_size=args.num_gpu, 150 | dtype=args.dtype, 151 | max_model_len=8192, 152 | swap_space=args.swap_space, 153 | ) 154 | 155 | if "meta-llama/CodeLlama" in args.model: 156 | stop = ["Q:", "A:"] 157 | elif "deepseek-ai/deepseek-coder" in args.model: 158 | stop = ["### Instruction", "### Response"] 159 | 160 | sampling_params = SamplingParams( 161 | n=args.num_gen, 162 | temperature=args.temperature, 163 | top_p=args.top_p, 164 | max_tokens=args.max_new_token, 165 | stop=stop, 166 | ) 167 | 168 | # load code contest test dataset 169 | test_dataset = load_dataset( 170 | "deepmind/code_contests", 171 | split="test", 172 | cache_dir="/data/huggingface/datasets", 173 | ) 174 | 175 | # set seed 176 | set_seed(args.seed) 177 | 178 | base_directory = os.path.dirname(__file__) 179 | 180 | # demonstration pool constructed by style or modularity 181 | demonstration_dataset = read_jsonl_to_dict( 182 | os.path.join( 183 | base_directory, 184 | "data", 185 | f"{args.metric}_demonstration.jsonl", 186 | ) 187 | ) 188 | assert len(demonstration_dataset) == 100 189 | 190 | # iterate over codes in the demonstration 191 | # make 1-shot prompt using the code and estimate pass@k 192 | for code_idx, data in enumerate(demonstration_dataset): 193 | if data['var_len'] < 5: 194 | continue 195 | print(f'average variable length: {data["var_len"]}') 196 | file_name = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.metric}_{code_idx}code_icl_result.jsonl" 197 | if not os.path.exists(os.path.join(base_directory, "result", file_name)): 198 | print(file_name) 199 | description = data['description'] 200 | code = data['code'] 201 | score_style = data['score_style'] # 'score_pep8', 'score_var', 'score_style' 202 | score_modularity = data['score_modularity'] 203 | 204 | # make demonstration for each code (1-shot) 205 | demonstration = defaultdict(list) 206 | demonstration['description'].append(description.strip()) 207 | demonstration["code"].append(code.strip()) 208 | demonstration['score_style'].append(score_style) 209 | demonstration['score_modularity'].append(score_modularity) 210 | demonstration['var_len'].append(data['var_len']) 211 | 212 | # make prompt for each test data 213 | prompts = [] 214 | # test_dataset = list(test_dataset)[:1] # for test 215 | for test_data in test_dataset: 216 | prompt = make_prompt(args, demonstration, test_data) 217 | prompts.append(prompt) 218 | 219 | # inference using vllm 220 | generations = [] 221 | solutions = [] 222 | # generate solution code using vllm 223 | print(f'') 224 | 225 | outputs = model.generate( 226 | prompts, sampling_params=sampling_params, use_tqdm=True 227 | ) 228 | for output in outputs: 229 | # for each input in the prompts, args.gen_num number of outputs are generated 230 | generations_ = [outs.text for outs in output.outputs] 231 | assert len(generations_) == args.num_gen 232 | # extract solution code from generated code 233 | solutions_ = [ 234 | extract_solution(args, generation) for generation in generations_ 235 | ] 236 | # save generated solutions (list) 237 | generations.append(generations_) 238 | solutions.append(solutions_) 239 | 240 | # save generated solutions 241 | result = [] 242 | for i, test_data in enumerate(test_dataset): 243 | result.append( 244 | { 245 | "name": test_data["name"], 246 | 
"description": test_data["description"], 247 | "public_tests": test_data["public_tests"], 248 | "private_tests": test_data["private_tests"], 249 | "difficulty": test_data["difficulty"], 250 | "cf_rating": test_data["cf_rating"], # difficulty level 251 | "generated_solutions": generations[i], # list of generated solutions 252 | "extracted_solutions": solutions[i], 253 | "prompt": prompts[i], 254 | "demonstration": demonstration, # contains code and its metric scores 255 | } 256 | ) 257 | 258 | write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name)) 259 | 260 | print(f'program ends.') 261 | 262 | 263 | if __name__ == "__main__": 264 | main() 265 | -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_42seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["Pasha loves to send strictly positive integers to his friends. Pasha cares about security, therefore when he wants to send an integer n, he encrypts it in the following way: he picks three integers a, b and c such that l \u2264 a,b,c \u2264 r, and then he computes the encrypted value m = n \u22c5 a + b - c.\n\nUnfortunately, an adversary intercepted the values l, r and m. Is it possible to recover the original values of a, b and c from this information? More formally, you are asked to find any values of a, b and c such that\n\n * a, b and c are integers, \n * l \u2264 a, b, c \u2264 r, \n * there exists a strictly positive integer n, such that n \u22c5 a + b - c = m. \n\nInput\n\nThe first line contains the only integer t (1 \u2264 t \u2264 20) \u2014 the number of test cases. The following t lines describe one test case each.\n\nEach test case consists of three integers l, r and m (1 \u2264 l \u2264 r \u2264 500 000, 1 \u2264 m \u2264 10^{10}). The numbers are such that the answer to the problem exists.\n\nOutput\n\nFor each test case output three integers a, b and c such that, l \u2264 a, b, c \u2264 r and there exists a strictly positive integer n such that n \u22c5 a + b - c = m. It is guaranteed that there is at least one possible solution, and you can output any possible combination if there are multiple solutions.\n\nExample\n\nInput\n\n\n2\n4 6 13\n2 3 1\n\n\nOutput\n\n\n4 6 5\n2 2 3\n\nNote\n\nIn the first example n = 3 is possible, then n \u22c5 4 + 6 - 5 = 13 = m. Other possible solutions include: a = 4, b = 5, c = 4 (when n = 3); a = 5, b = 4, c = 6 (when n = 3); a = 6, b = 6, c = 5 (when n = 2); a = 6, b = 5, c = 4 (when n = 2).\n\nIn the second example the only possible case is n = 1: in this case n \u22c5 2 + 2 - 3 = 1 = m. Note that, n = 0 is not possible, since in that case n is not a strictly positive integer.", "You are given three integers x, y and n. Your task is to find the maximum integer k such that 0 \u2264 k \u2264 n that k mod x = y, where mod is modulo operation. Many programming languages use percent operator % to implement it.\n\nIn other words, with given x, y and n you need to find the maximum possible integer from 0 to n that has the remainder y modulo x.\n\nYou have to answer t independent test cases. It is guaranteed that such k exists for each test case.\n\nInput\n\nThe first line of the input contains one integer t (1 \u2264 t \u2264 5 \u22c5 10^4) \u2014 the number of test cases. 
The next t lines contain test cases.\n\nThe only line of the test case contains three integers x, y and n (2 \u2264 x \u2264 10^9;~ 0 \u2264 y < x;~ y \u2264 n \u2264 10^9).\n\nIt can be shown that such k always exists under the given constraints.\n\nOutput\n\nFor each test case, print the answer \u2014 maximum non-negative integer k such that 0 \u2264 k \u2264 n and k mod x = y. It is guaranteed that the answer always exists.\n\nExample\n\nInput\n\n\n7\n7 5 12345\n5 0 4\n10 5 15\n17 8 54321\n499999993 9 1000000000\n10 5 187\n2 0 999999999\n\n\nOutput\n\n\n12339\n0\n15\n54306\n999999995\n185\n999999998\n\nNote\n\nIn the first test case of the example, the answer is 12339 = 7 \u22c5 1762 + 5 (thus, 12339 mod 7 = 5). It is obvious that there is no greater integer not exceeding 12345 which has the remainder 5 modulo 7."], "public_tests": [{"input": ["2\n4 6 13\n2 3 1\n"], "output": ["4 5 4\n2 2 3\n"]}, {"input": ["7\n7 5 12345\n5 0 4\n10 5 15\n17 8 54321\n499999993 9 1000000000\n10 5 187\n2 0 999999999\n"], "output": ["12339\n0\n15\n54306\n999999995\n185\n999999998\n"]}], "private_tests": [{"input": ["20\n10 12 43\n25 49 1\n5 7 39\n8 9 44\n16 17 50\n30 40 975\n601 801 1000\n100 102 909\n599 799 1000\n503 997 9\n194 383 5\n90000 100000 709999\n75000 100000 124999\n375000 499999 625001\n375000 500000 624999\n300000 400000 499999\n250000 500000 1\n70000 80000 2272770257\n70000 80000 9999953344\n90000 100000 9999955820\n", "20\n375000 500000 624999\n375000 499999 624997\n375003 499999 624995\n375002 499999 624995\n375001 499999 624996\n375002 499999 624996\n375001 499999 624997\n375000 499999 624991\n375000 499999 624995\n375000 499999 624994\n375000 499999 624993\n375000 499999 624998\n375000 499999 624996\n375000 499999 624992\n375000 499999 624988\n375000 499999 624986\n375000 499999 624982\n375000 499999 624990\n375000 499999 624991\n375000 499999 624989\n", "20\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n", "4\n4 6 12\n1 1 1\n2 2 2\n3 3 3\n", "20\n1 500000 10000000000\n500000 500000 10000000000\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n", "7\n375000 499999 624997\n375000 500000 624999\n375000 499999 624995\n375000 499999 624994\n375000 499999 624993\n375000 499999 624998\n375000 499999 624996\n", "20\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n", "1\n1 4 10000000000\n"], "output": ["11 10 11\n25 25 49\n5 5 6\n9 8 9\n17 16 17\n35 30 35\n800 801 601\n101 100 100\n599 599 797\n503 503 997\n194 194 383\n100000 99999 90000\n99999 100000 75000\n375000 375000 
499999\n499999 500000 375000\n399999 400000 300000\n250000 250000 499999\n70007 70000 76998\n70009 77802 70000\n90003 90000 97501\n", "499999 500000 375000\n499998 499999 375000\n499999 499999 375003\n499998 499999 375002\n499998 499999 375001\n499999 499999 375002\n499999 499999 375001\n499992 499999 375000\n499996 499999 375000\n499995 499999 375000\n499994 499999 375000\n499999 499999 375000\n499997 499999 375000\n499993 499999 375000\n499989 499999 375000\n499987 499999 375000\n499983 499999 375000\n499991 499999 375000\n499992 499999 375000\n499990 499999 375000\n", "250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n", "4 4 4\n1 1 1\n2 2 2\n3 3 3\n", "1 1 1\n500000 500000 500000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n", "499998 499999 375000\n499999 500000 375000\n499996 499999 375000\n499995 499999 375000\n499994 499999 375000\n499999 499999 375000\n499997 499999 375000\n", "499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n", "1 1 1\n"]}, {"input": ["1\n31 2 104\n", "1\n43284 1 33424242\n", "1\n943643 1 23522222\n", "1\n33 6 100\n", "1\n1000000000 0 999999999\n", "1\n4452384 1 3573842\n"], "output": ["95\n", "33415249\n", "22647433\n", "72\n", "0\n", "1\n"]}], "transformed_sc": ["\nfor _ in range(int(input())):\n l, r, m = map(int, input().split())\n for a in range(l, r + 1):\n minn = m // a\n maxn = 0 - -m // a\n if l - r <= m - minn * a <= r - l and minn > 0:\n m -= minn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n c = b - m\n print(a,b,c)\n break\n break\n if l - r <= m - maxn * a <= r - l:\n m -= maxn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n c = b - m\n print(a,b,c)\n break\n break\n ", "test_cases = int(input())\nfor _ in range(test_cases):\n x, y, n = map(int, input().split())\n if n < x and y == 0:\n print(0)\n elif n < x and y == 1:\n print(1)\n elif n % x == y:\n print(n)\n else:\n if x == 2 and y == 1 and n == 1:\n print(1)\n elif n % x < y:\n print(n - ((n % x) + (x - y)))\n else:\n print(n - ((n % x) - y))"], "sc": ["for _ in range(int(input())):\n l,r,m=map(int,input().split())\n for a in range(l,r+1):\n minn=m//a\n maxn=0--m//a\n if l-r<=m-minn*a<=r-l and minn>0:\n m-=minn*a\n for b in range(l,r+1):\n if -r<=m-b<=-l:\n print(a,b,b-m)\n break\n break\n if l-r<=m-maxn*a<=r-l:\n m-=maxn*a\n for b in range(l,r+1):\n if -r<=m-b<=-l:\n print(a,b,b-m)\n break\n break", "n = int(input())\nfor i in range(n):\n x,y,n = 
map(int,input().split())\n if n < x and y == 0:\n print(0)\n elif n < x and y == 1:\n print(1)\n elif n%x == y:\n print(n)\n else:\n if x == 2 and y == 1 and n == 1:\n print(1)\n elif n%x < y:\n print(n-((n%x)+(x-y)))\n else:\n print(n - ((n%x) - y))"], "sc_cc": [10.0, 11.0], "transformed_mc": ["\ndef find_values(l, r, m):\n for a in range(l, r + 1):\n minn = m // a\n maxn = 0 - -m // a\n if l - r <= m - minn * a <= r - l and minn > 0:\n m -= minn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n return a, b, b - m\n if l - r <= m - maxn * a <= r - l:\n m -= maxn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n return a, b, b - m\n\ndef main():\n for _ in range(int(input())):\n l, r, m = map(int, input().split())\n a, b, c = find_values(l, r, m)\n print(a, b, c)\n\nif __name__ == '__main__':\n main()\n", "\ndef find_maximum_k(x, y, n):\n if n < x and y == 0:\n return 0\n elif n < x and y == 1:\n return 1\n elif n % x == y:\n return n\n else:\n if x == 2 and y == 1 and n == 1:\n return 1\n elif n % x < y:\n return n - ((n % x) + (x - y))\n else:\n return n - ((n % x) - y)\n\ndef main():\n test_cases = int(input())\n for _ in range(test_cases):\n x, y, n = map(int, input().split())\n print(find_maximum_k(x, y, n))\n\nif __name__ == '__main__':\n main()\n"]} 2 | -------------------------------------------------------------------------------- /codecontests/icl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | import numpy as np 6 | from collections import defaultdict 7 | import torch 8 | from datasets import load_dataset 9 | from vllm import LLM, SamplingParams 10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score 11 | from utils.utils import get_code_modularity_score 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | # When running on the CuDNN backend, two further options must be set 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False 22 | # Set a fixed value for the hash seed 23 | os.environ["PYTHONHASHSEED"] = str(seed) 24 | 25 | 26 | def extract_solution(args, generation): 27 | if "CodeLlama" in args.model: 28 | start_index = generation.find("```") 29 | if start_index == -1: 30 | solution = "" 31 | else: 32 | end_index = generation.find("```", start_index + len("```")) 33 | if start_index < end_index: 34 | solution = generation[start_index + len("```") : end_index] 35 | else: 36 | solution = "" 37 | 38 | elif "deepseek" in args.model: 39 | start_index = generation.find("```") 40 | if start_index == -1: 41 | solution = "" 42 | else: 43 | end_index = generation.find("```", start_index + len("```")) 44 | if start_index < end_index: 45 | solution = generation[start_index + len("```") : end_index] 46 | else: 47 | solution = "" 48 | 49 | return solution 50 | 51 | 52 | def make_prompt(args, demonstration, test_data): 53 | if 'monolithic' in args.code_type: 54 | instruction = ( 55 | "Write a python code to solve the following coding problem " 56 | "that obeys the constraints and passes the example test cases. " 57 | "The output code needs to read from and write to standard IO. 
" 58 | "Please wrap your code answer using ```:" 59 | ) 60 | elif 'modular' in args.code_type: 61 | instruction = ( 62 | "Write a python code to solve the following coding problem " 63 | "that obeys the constraints and passes the example test cases. " 64 | "The output code needs to read from and write to standard IO. " 65 | "Ensure modularity of the python code by dividing the code into smaller, " 66 | "useful functions to solve the given problem. " 67 | "Please wrap your code answer using ```:" 68 | ) 69 | 70 | if "CodeLlama" in args.model: 71 | # make zero-shot or few-shot prompt 72 | prompt = "" 73 | for i in range(args.num_icl_shot): 74 | prompt += "Q: " + instruction + "\n" 75 | prompt += demonstration["description"][i] + "\n" 76 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 77 | prompt += "Q: " + instruction + "\n" 78 | prompt += test_data["description"] + "\n" 79 | prompt += "A: " 80 | elif "deepseek" in args.model: 81 | # make zero-shot or few-shot prompt 82 | prompt = "" 83 | prompt += instruction + "\n" 84 | for i in range(args.num_icl_shot): 85 | prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 86 | prompt += ( 87 | "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 88 | ) 89 | prompt += "### Instruction:\n" + test_data["description"] + "\n" 90 | prompt += "### Response:\n" 91 | 92 | return prompt 93 | 94 | 95 | def extract_demonstration(train_dataset, shot, code_type): 96 | if 'transformed' not in code_type: 97 | problem_index_with_both_sc_and_mc = [] 98 | for i, data in enumerate(train_dataset): 99 | num_sc = len(data['monolithic_codes']['monolithic_code']) 100 | num_mc = len(data['modular_codes']['modular_code']) 101 | if num_sc > 0 and num_mc > 0: 102 | problem_index_with_both_sc_and_mc.append(i) 103 | 104 | demonstration = defaultdict(list) 105 | for i in random.sample(problem_index_with_both_sc_and_mc, shot): 106 | data = train_dataset[i] 107 | # modularity check 108 | # print(f'problem {i}') 109 | # tmp = [] 110 | # for code in data['modular_codes']['modular_code']: 111 | # modularity = get_code_modularity_score(code) 112 | # tmp.append(modularity) 113 | # print(tmp) 114 | if code_type == 'monolithic': 115 | demonstration['description'].append(data['problem_description'].strip()) 116 | demonstration['code'].append(data['monolithic_codes']['monolithic_code'][0].strip()) # pick the first code 117 | # print(get_code_modularity_score(data['monolithic_codes']['monolithic_code'][0])) 118 | elif code_type == 'modular': 119 | demonstration['description'].append(data['problem_description'].strip()) 120 | demonstration['code'].append(data['modular_codes']['modular_code'][0].strip()) 121 | print(get_code_modularity_score(data['modular_codes']['modular_code'][0])) 122 | print(data['modular_codes']['modular_code'][0]) 123 | 124 | return demonstration 125 | 126 | else: 127 | if code_type == 'transformed_modular': 128 | key = 'transformed_mc' 129 | elif code_type == 'transformed_monolithic': 130 | key = 'transformed_sc' 131 | 132 | demonstration = defaultdict(list) 133 | for i in range(shot): 134 | demonstration['description'].append(dataset['problem_description'][i].strip()) 135 | demonstration['code'].append(dataset[key][i].strip()) 136 | 137 | return demonstration 138 | 139 | 140 | def main(): 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--seed", type=int, required=True, default=42) 143 | parser.add_argument( 144 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 145 | ) 146 | 
parser.add_argument("--num_gpu", type=int, required=True, default=1, help="total number of gpus used") 147 | parser.add_argument("--dtype", type=str, required=True, default="float16") 148 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 149 | parser.add_argument( 150 | "--num_gen", 151 | type=int, 152 | required=True, 153 | default=1, 154 | help="number of solutions generated per problem", 155 | ) 156 | parser.add_argument( 157 | "--temperature", 158 | type=float, 159 | required=True, 160 | default=0, 161 | help="0 means greedy decoding for vllm", 162 | ) 163 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 164 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 165 | parser.add_argument( 166 | "--swap_space", 167 | type=int, 168 | required=False, 169 | default=4, 170 | help="The size (GiB) of CPU memory per GPU to use as swap space", 171 | ) 172 | parser.add_argument('--code_type', type=str, required=True, default='monolithic') 173 | # additional arguments candidiates: 174 | # max_model_len 175 | # stop 176 | # start_token, end_token 177 | args = parser.parse_args() 178 | 179 | # load model 180 | # when initializing VLLM engine, random.seed() is called internally. 181 | # so, set_seed() should be called after initializing VLLM engine. 182 | model = LLM( 183 | model=args.model, 184 | tensor_parallel_size=args.num_gpu, 185 | dtype=args.dtype, 186 | max_model_len=8192, 187 | swap_space=args.swap_space, 188 | ) 189 | 190 | if "CodeLlama" in args.model: 191 | stop = ["Q:", "A:"] 192 | elif "deepseek" in args.model: 193 | stop = ["### Instruction", "### Response"] 194 | 195 | sampling_params = SamplingParams( 196 | n=args.num_gen, 197 | temperature=args.temperature, 198 | top_p=args.top_p, 199 | max_tokens=args.max_new_token, 200 | stop=stop, 201 | ) 202 | 203 | # load code contest test dataset 204 | test_dataset = load_dataset( 205 | "deepmind/code_contests", 206 | split="test", 207 | ) 208 | 209 | # set seed 210 | set_seed(args.seed) 211 | 212 | base_directory = os.path.dirname(__file__) 213 | 214 | # monolithic(sc) or modular(mc) demonstration 215 | if 'transformed' not in args.code_type: 216 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', 'my_code_contests_divided_train.jsonl')) 217 | # transformed monolithic(tsc) or transformed modular(tmc) demonstration 218 | else: 219 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', f'monolithic_2shot_demonstration_{args.seed}seed.jsonl'))[0] 220 | 221 | demonstration = extract_demonstration(dataset, args.num_icl_shot, args.code_type) 222 | 223 | file_name = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 224 | 225 | if os.path.exists(os.path.join(base_directory, "result", file_name)): 226 | print(f'{file_name} already exists.') 227 | return 228 | 229 | # make prompt for each test data 230 | prompts = [] 231 | # test_dataset = list(test_dataset)[:5] # for test 232 | for test_data in test_dataset: 233 | prompt = make_prompt(args, demonstration, test_data) 234 | prompts.append(prompt) 235 | 236 | # inference using vllm 237 | generations = [] 238 | solutions = [] 239 | 240 | # generate solution code using vllm 241 | outputs = model.generate( 242 | prompts, sampling_params=sampling_params, use_tqdm=True 243 | ) 244 | for output in outputs: 245 | # for each input in the prompts, args.gen_num number of outputs are generated 246 | 
generations_ = [outs.text for outs in output.outputs] 247 | assert len(generations_) == args.num_gen 248 | # extract solution code from generated code 249 | solutions_ = [ 250 | extract_solution(args, generation) for generation in generations_ 251 | ] 252 | # save generated solutions (list) 253 | generations.append(generations_) 254 | solutions.append(solutions_) 255 | 256 | # save generated solutions 257 | result = [] 258 | for i, test_data in enumerate(test_dataset): 259 | result.append( 260 | { 261 | "name": test_data["name"], 262 | "description": test_data["description"], 263 | "public_tests": test_data["public_tests"], 264 | "private_tests": test_data["private_tests"], 265 | "difficulty": test_data["difficulty"], 266 | "cf_rating": test_data["cf_rating"], # difficulty level 267 | "generated_solutions": generations[i], # list of generated solutions 268 | "extracted_solutions": solutions[i], 269 | "prompt": prompts[i], 270 | "demonstration": demonstration, # contains code and its description 271 | } 272 | ) 273 | 274 | write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name)) 275 | print(f'program ends.') 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /apps/data/2shot_demonstration_27seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":1596,"problem_description":"During quarantine chef\u2019s friend invented a game. In this game there are two players, player 1 and Player 2. In center of garden there is one finish circle and both players are at different distances respectively $X$ and $Y$ from finish circle.\nBetween finish circle and Player 1 there are $X$ number of circles and between finish circle and Player 2 there are $Y$ number of circles. Both player wants to reach finish circle with minimum number of jumps. Player can jump one circle to another circle.\nBoth players can skip $2^0-1$ or $2^1- 1$ or \u2026. or $2^N-1$ circles per jump. A player cannot skip same number of circles in a match more than once. If both players uses optimal way to reach finish circle what will be the difference of minimum jumps needed to reach finish circle by both players. \nIf both players reach finish circle with same number of jumps answer will be $0$ $0$.\n\n-----Input:-----\n- The first line of the input contains a single integer $T$ denoting the number of test cases. The \ndescription of $T$ test cases follows.\n- The first line of each test case contains 2 space separated integers $X$ and $Y$.\n\n-----Output:-----\nFor each test case, print a single line containing 2 space-separated integers which player win and what is the difference between number of minimum jump required by both players to reach finish circle.\n\n-----Constraints-----\n- $1 \\leq T \\leq 10^5$\n- $1 \\leq X,Y \\leq 2*10^7$\n\n-----Sample Input:-----\n2\n4 5\n3 5\n\n-----Sample Output:-----\n0 0\n1 1\n\n-----Explanation:-----\nTest Case 1:\n\nTest Case 2:","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. 
Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. Please wrap your code answer using ```:","sc":"import math\n\nfor i in range(int(input())):\n p,q=list(map(int,input().split()))\n c=0\n h=0\n \n while(q>=0):\n if(q==0):\n h+=1\n break\n \n d=int(math.log2(q+1))\n if(d==0):\n h+=1\n break\n y=(2**d)-1\n q-=y+1\n if(q==-1):\n h+=1\n break\n h+=1\n \n while(p>=0):\n if(p==0):\n c+=1\n break\n else:\n rem=int(math.log2(p+1))\n \n if(rem==0):\n c+=1\n break\n \n y=(2**rem)-1\n p-=y+1\n if(p==-1):\n c+=1\n break\n c+=1\n\n if(c==h):\n print(0,0)\n if(ch):\n print(2,c-h)","sc_cc":13.0,"mc":"takeArr = lambda: list(map(int,input().split()))\ntakeList = lambda: list(map(int,input().split()))\nimport sys\nsys.setrecursionlimit(10**6)\n\n\n\nfrom math import floor,ceil,log2 \ndef powOfPositive(n) : \n pos = floor(log2(n)); \n return 2**pos; \ndef powOfNegative(n) : \n pos = ceil(log2(n)); \n return (-1 * pow(2, pos)); \ndef highestPowerOf2(n) : \n if (n > 0) : \n return powOfPositive(n); \n else : \n n = -n; \n return powOfNegative(n); \ndef main(t):\n x,y = takeArr()\n a,b = x+1,y+1\n sa = sb = 0\n while a:\n a -= highestPowerOf2(a)\n sa += 1\n while b:\n b -= highestPowerOf2(b)\n sb += 1\n \n winner = 2 if sa>sb else 1 if sb>sa else 0\n score = abs(sa-sb) if winner else 0\n print(winner,score)\n if t>1:\n main(t-1)\nmain(int(input()))","mc_cc":2.4,"transformed_mc":["\nimport math\n\ndef calculate_jumps(distance):\n jumps = 0\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n\n return jumps\n\ndef determine_winner_and_difference(player1_distance, player2_distance):\n player1_jumps = calculate_jumps(player1_distance)\n player2_jumps = calculate_jumps(player2_distance)\n\n if player1_jumps == player2_jumps:\n return 0, 0\n elif player1_jumps < player2_jumps:\n return 1, player2_jumps - player1_jumps\n else:\n return 2, player1_jumps - player2_jumps\n\ndef main():\n for _ in range(int(input())):\n player1_distance, player2_distance = map(int, input().split())\n winner, difference = determine_winner_and_difference(player1_distance, player2_distance)\n print(winner, difference)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import math\n\nfor _ in range(int(input())):\n player1_distance, player2_distance = map(int, input().split())\n \n jumps = 0\n distance = player1_distance\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n player1_jumps = jumps\n \n jumps = 0\n distance = player2_distance\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n player2_jumps = jumps\n \n if player1_jumps == player2_jumps:\n winner = 0\n difference = 0\n elif player1_jumps < player2_jumps:\n winner = 1\n difference = player2_jumps - player1_jumps\n else:\n winner = 2\n difference = player1_jumps - player2_jumps\n \n print(winner, difference)"]} 2 | {"problem_id":2348,"problem_description":"N hotels are located on a straight line. 
The coordinate of the i-th hotel (1 \\leq i \\leq N) is x_i.\nTak the traveler has the following two personal principles:\n - He never travels a distance of more than L in a single day.\n - He never sleeps in the open. That is, he must stay at a hotel at the end of a day.\nYou are given Q queries. The j-th (1 \\leq j \\leq Q) query is described by two distinct integers a_j and b_j.\nFor each query, find the minimum number of days that Tak needs to travel from the a_j-th hotel to the b_j-th hotel following his principles.\nIt is guaranteed that he can always travel from the a_j-th hotel to the b_j-th hotel, in any given input.\n\n-----Constraints-----\n - 2 \\leq N \\leq 10^5\n - 1 \\leq L \\leq 10^9\n - 1 \\leq Q \\leq 10^5\n - 1 \\leq x_i < x_2 < ... < x_N \\leq 10^9\n - x_{i+1} - x_i \\leq L\n - 1 \\leq a_j,b_j \\leq N\n - a_j \\neq b_j\n - N,\\,L,\\,Q,\\,x_i,\\,a_j,\\,b_j are integers.\n\n-----Partial Score-----\n - 200 points will be awarded for passing the test set satisfying N \\leq 10^3 and Q \\leq 10^3.\n\n-----Input-----\nThe input is given from Standard Input in the following format:\nN\nx_1 x_2 ... x_N\nL\nQ\na_1 b_1\na_2 b_2\n:\na_Q b_Q\n\n-----Output-----\nPrint Q lines.\nThe j-th line (1 \\leq j \\leq Q) should contain the minimum number of days that Tak needs to travel from the a_j-th hotel to the b_j-th hotel.\n\n-----Sample Input-----\n9\n1 3 6 13 15 18 19 29 31\n10\n4\n1 8\n7 3\n6 7\n8 5\n\n-----Sample Output-----\n4\n2\n1\n2\n\nFor the 1-st query, he can travel from the 1-st hotel to the 8-th hotel in 4 days, as follows:\n - Day 1: Travel from the 1-st hotel to the 2-nd hotel. The distance traveled is 2.\n - Day 2: Travel from the 2-nd hotel to the 4-th hotel. The distance traveled is 10.\n - Day 3: Travel from the 4-th hotel to the 7-th hotel. The distance traveled is 6.\n - Day 4: Travel from the 7-th hotel to the 8-th hotel. The distance traveled is 10.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"import bisect\nimport sys\ninput = sys.stdin.readline\nn = int(input())\na = list(map(int,input().split()))\nd = int(input())\ngraph = [[0 for i in range(n+1)] for j in range(18)]\nfor i in range(n):\n x = bisect.bisect_right(a,a[i]+d)\n graph[0][i+1] = x\nfor j in range(1,18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\nq = int(input())\nfor _ in range(q):\n x,y = map(int,input().split())\n x,y = min(x,y),max(x,y)\n ans = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n ans += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n ans += 1\n print(ans)","sc_cc":11.0,"mc":"def reachN(i0, n):\n if n == 0:\n return i0\n maxbit = ceil(log2(n))\n kL = []\n for i in range(maxbit+1):\n if n>>i & 1:\n kL.append(i)\n \n i = i0\n for k in kL[::-1]:\n i = dp[k][i] \n return i \n\ndef fun(n):\n return reachN(start, n) >= goal\n \ndef binMin(l, r):\n if r-l == 1:\n return r\n m = (l+r) \/\/ 2\n if fun(m):\n r = m\n else:\n l = m\n return binMin(l, r)\n\nimport bisect\nfrom math import log2, ceil\n\nN = int(input())\nxL = [-float('inf')] + [int(i) for i in input().split()] \nLmax = int(input())\n\nkmax = ceil(log2(len(xL)-2)) \ndp = [[len(xL)-1]*(N+1) for _ in range(kmax+1)]\nfor i in range(1, N+1):\n dp[0][i] = bisect.bisect_right(xL, xL[i]+Lmax) - 1\n\nfor k in range(1, kmax+1):\n for i in range(1, N):\n dp[k][i] = dp[k-1][dp[k-1][i]]\n\n\nQ = int(input())\nfor _ in range(Q):\n start, goal = list(map(int, input().split()))\n if start > goal:\n start, goal = goal, start\n ans = binMin(0, goal-start)\n print(ans)","mc_cc":4.25,"transformed_mc":["\nimport bisect\nimport sys\n\ndef hotel_distances(n, a, d):\n graph = [[0 for i in range(n+1)] for j in range(18)]\n for i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\n for j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n return graph\n\ndef calculate_days_to_travel(graph, x, y):\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days += 1\n return days\n\ndef main():\n n = int(input())\n a = list(map(int, input().split()))\n d = int(input())\n graph = hotel_distances(n, a, d)\n q = int(input())\n for _ in range(q):\n x, y = map(int, input().split())\n days = calculate_days_to_travel(graph, x, y)\n print(days)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import bisect\nimport sys\n\ndef hotel_distances(n, a, d):\n graph = [[0 for i in range(n+1)] for j in range(18)]\n for i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\n for j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n return graph\n\ndef calculate_days_to_travel(graph, x, y):\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days += 1\n return days\n\nn = int(input())\na = list(map(int, input().split()))\nd = int(input())\ngraph = [[0 for i in range(n+1)] for j in range(18)]\nfor i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\nfor j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n\nq = int(input())\nfor _ in range(q):\n x, y = map(int, input().split())\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days 
+= 1\n print(days)"]} 3 | -------------------------------------------------------------------------------- /codecontests/icl_ft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | import numpy as np 6 | from collections import defaultdict 7 | import torch 8 | from datasets import load_dataset 9 | from vllm import LLM, SamplingParams 10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score 11 | from utils.utils import get_code_modularity_score 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | # When running on the CuDNN backend, two further options must be set 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False 22 | # Set a fixed value for the hash seed 23 | os.environ["PYTHONHASHSEED"] = str(seed) 24 | 25 | 26 | def extract_solution(args, generation): 27 | if "CodeLlama" in args.model: 28 | # start_index = generation.find("```") 29 | # if start_index == -1: 30 | # solution = "" 31 | # else: 32 | # end_index = generation.find("```", start_index + len("```")) 33 | # if start_index < end_index: 34 | # solution = generation[start_index + len("```") : end_index] 35 | # else: 36 | # solution = "" 37 | idx = generation.find('```') 38 | if idx != -1: 39 | solution = generation[:idx] 40 | else: 41 | solution = generation.strip() 42 | 43 | elif "deepseek" in args.model: 44 | idx = generation.find('```') 45 | if idx != -1: 46 | solution = generation[:idx] 47 | else: 48 | solution = generation.strip() 49 | 50 | return solution 51 | 52 | 53 | def make_prompt(args, demonstration, test_data): 54 | instruction = ( 55 | "Write a python code to solve the following coding problem " 56 | "that obeys the constraints and passes the example test cases. " 57 | "The output code needs to read from and write to standard IO. 
" 58 | "Please wrap your code answer using ```:" 59 | ) 60 | 61 | if "CodeLlama" in args.model: 62 | # make zero-shot or few-shot prompt 63 | prompt = "" 64 | for i in range(args.num_icl_shot): 65 | prompt += "Q: " + instruction + "\n" 66 | prompt += demonstration["description"][i] + "\n" 67 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 68 | prompt += "Q: " + instruction + "\n" 69 | prompt += test_data["description"].strip() + "\n" 70 | prompt += "A: ```" 71 | elif "deepseek" in args.model: 72 | prompt = "" 73 | for i in range(args.num_icl_shot): 74 | prompt += "Q: " + instruction + "\n" 75 | prompt += demonstration["description"][i] + "\n" 76 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 77 | prompt += "Q: " + instruction + "\n" 78 | prompt += test_data["description"].strip() + "\n" 79 | prompt += "A: ```" 80 | 81 | # # make zero-shot or few-shot prompt 82 | # prompt = "" 83 | # prompt += instruction + "\n" 84 | # for i in range(args.num_icl_shot): 85 | # prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 86 | # prompt += ( 87 | # "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 88 | # ) 89 | # prompt += "### Instruction:\n" + test_data["description"].strip() + "\n" 90 | # prompt += "### Response:\n" 91 | 92 | return prompt 93 | 94 | 95 | def extract_demonstration(train_dataset, shot, code_type): 96 | if 'transformed' not in code_type: 97 | problem_index_with_both_sc_and_mc = [] 98 | for i, data in enumerate(train_dataset): 99 | num_sc = len(data['monolithic_codes']['monolithic_code']) 100 | num_mc = len(data['modular_codes']['modular_code']) 101 | if num_sc > 0 and num_mc > 0: 102 | problem_index_with_both_sc_and_mc.append(i) 103 | 104 | demonstration = defaultdict(list) 105 | for i in random.sample(problem_index_with_both_sc_and_mc, shot): 106 | data = train_dataset[i] 107 | # modularity check 108 | # print(f'problem {i}') 109 | # tmp = [] 110 | # for code in data['modular_codes']['modular_code']: 111 | # modularity = get_code_modularity_score(code) 112 | # tmp.append(modularity) 113 | # print(tmp) 114 | if code_type == 'monolithic': 115 | demonstration['description'].append(data['problem_description'].strip()) 116 | demonstration['code'].append(data['monolithic_codes']['monolithic_code'][0].strip()) # pick the first code 117 | # print(get_code_modularity_score(data['monolithic_codes']['monolithic_code'][0])) 118 | elif code_type == 'modular': 119 | demonstration['description'].append(data['problem_description'].strip()) 120 | demonstration['code'].append(data['modular_codes']['modular_code'][0].strip()) 121 | print(get_code_modularity_score(data['modular_codes']['modular_code'][0])) 122 | print(data['modular_codes']['modular_code'][0]) 123 | 124 | return demonstration 125 | 126 | else: 127 | if code_type == 'transformed_modular': 128 | key = 'transformed_mc' 129 | elif code_type == 'transformed_monolithic': 130 | key = 'transformed_sc' 131 | 132 | demonstration = defaultdict(list) 133 | for i in range(shot): 134 | demonstration['description'].append(dataset['problem_description'][i].strip()) 135 | demonstration['code'].append(dataset[key][i].strip()) 136 | 137 | return demonstration 138 | 139 | 140 | def main(): 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--seed", type=int, required=True, default=42) 143 | parser.add_argument( 144 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 145 | ) 146 | parser.add_argument("--num_gpu", type=int, required=True, 
default=1, help="total number of gpus used") 147 | parser.add_argument("--dtype", type=str, required=True, default="float16") 148 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 149 | parser.add_argument( 150 | "--num_gen", 151 | type=int, 152 | required=True, 153 | default=1, 154 | help="number of solutions generated per problem", 155 | ) 156 | parser.add_argument( 157 | "--temperature", 158 | type=float, 159 | required=True, 160 | default=0, 161 | help="0 means greedy decoding for vllm", 162 | ) 163 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 164 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 165 | parser.add_argument( 166 | "--swap_space", 167 | type=int, 168 | required=False, 169 | default=4, 170 | help="The size (GiB) of CPU memory per GPU to use as swap space", 171 | ) 172 | parser.add_argument('--code_type', type=str, required=True, default='monolithic') 173 | parser.add_argument('--degree', type=str, required=True, default='low') 174 | parser.add_argument('--debug_mode', type=int, required=True, default=0) 175 | parser.add_argument('--chkpt', type=str, required=True, default=0) 176 | # additional arguments candidiates: 177 | # max_model_len 178 | # stop 179 | # start_token, end_token 180 | args = parser.parse_args() 181 | 182 | # load model 183 | # when initializing VLLM engine, random.seed() is called internally. 184 | # so, set_seed() should be called after initializing VLLM engine. 185 | model = LLM( 186 | model=args.model, 187 | tensor_parallel_size=args.num_gpu, 188 | dtype=args.dtype, 189 | max_model_len=8192, 190 | swap_space=args.swap_space, 191 | ) 192 | 193 | # all models are fine-tuned with "Q:,, A:,," format 194 | stop = ["Q:", "A:"] 195 | 196 | sampling_params = SamplingParams( 197 | n=args.num_gen, 198 | temperature=args.temperature, 199 | top_p=args.top_p, 200 | max_tokens=args.max_new_token, 201 | stop=stop, 202 | ) 203 | 204 | # load code contest test dataset 205 | test_dataset = load_dataset( 206 | "deepmind/code_contests", 207 | split="test", 208 | ) 209 | 210 | # set seed 211 | set_seed(args.seed) 212 | 213 | base_directory = os.path.dirname(__file__) 214 | 215 | # monolithic(sc) or modular(mc) demonstration 216 | if 'transformed' not in args.code_type: 217 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', 'my_code_contests_divided_train.jsonl')) 218 | # transformed monolithic(tsc) or transformed modular(tmc) demonstration 219 | else: 220 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', f'monolithic_2shot_demonstration_{args.seed}seed.jsonl'))[0] 221 | 222 | demonstration = extract_demonstration(dataset, args.num_icl_shot, args.code_type) 223 | 224 | if "CodeLlama" in args.model: 225 | file_name = f"CodeLlama_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 226 | elif "deepseek" in args.model: 227 | file_name = f"DeepSeek_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 228 | 229 | 230 | if os.path.exists(os.path.join(base_directory, "result/ft", file_name)): 231 | print(f'{file_name} already exists.') 232 | return 233 | 234 | # make prompt for each test data 235 | if args.debug_mode: 236 | test_dataset = list(test_dataset)[:10] # for test 237 | 238 | prompts = [] 239 | for test_data in test_dataset: 240 | prompt = make_prompt(args, 
demonstration, test_data) 241 | prompts.append(prompt) 242 | 243 | # inference using vllm 244 | generations = [] 245 | solutions = [] 246 | 247 | # generate solution code using vllm 248 | outputs = model.generate( 249 | prompts, sampling_params=sampling_params, use_tqdm=True 250 | ) 251 | for idx, output in enumerate(outputs): 252 | # for each input in the prompts, args.gen_num number of outputs are generated 253 | generations_ = [outs.text.strip() for outs in output.outputs] 254 | assert len(generations_) == args.num_gen 255 | # extract solution code from generated code 256 | solutions_ = [ 257 | extract_solution(args, generation) for generation in generations_ 258 | ] 259 | if args.debug_mode: 260 | print(f'problem {idx}, prompt:') 261 | print(prompts[idx]) 262 | print('-' * 100) 263 | print('generation:') 264 | print(generations_[0].strip()) 265 | print('-' * 100) 266 | print('solution:') 267 | print(solutions_[0].strip()) 268 | print('*' * 100) 269 | 270 | # save generated solutions (list) 271 | generations.append(generations_) 272 | solutions.append(solutions_) 273 | 274 | # save generated solutions 275 | result = [] 276 | for i, test_data in enumerate(test_dataset): 277 | result.append( 278 | { 279 | "name": test_data["name"], 280 | "description": test_data["description"], 281 | "public_tests": test_data["public_tests"], 282 | "private_tests": test_data["private_tests"], 283 | "difficulty": test_data["difficulty"], 284 | "cf_rating": test_data["cf_rating"], # difficulty level 285 | "generated_solutions": generations[i], # list of generated solutions 286 | "extracted_solutions": solutions[i], 287 | "prompt": prompts[i], 288 | "demonstration": demonstration, # contains code and its description 289 | } 290 | ) 291 | 292 | if not args.debug_mode: 293 | write_dict_to_jsonl(result, os.path.join(base_directory, "result/ft", file_name)) 294 | print(f'program ends.') 295 | 296 | 297 | if __name__ == "__main__": 298 | main() 299 | -------------------------------------------------------------------------------- /apps/data/2shot_demonstration_42seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":50,"problem_description":"Karlsson has recently discovered a huge stock of berry jam jars in the basement of the house. More specifically, there were $2n$ jars of strawberry and blueberry jam.\n\nAll the $2n$ jars are arranged in a row. The stairs to the basement are exactly in the middle of that row. So when Karlsson enters the basement, he sees exactly $n$ jars to his left and $n$ jars to his right.\n\nFor example, the basement might look like this: [Image] \n\nBeing the starightforward man he is, he immediately starts eating the jam. In one minute he chooses to empty either the first non-empty jar to his left or the first non-empty jar to his right.\n\nFinally, Karlsson decided that at the end the amount of full strawberry and blueberry jam jars should become the same.\n\nFor example, this might be the result: [Image] He has eaten $1$ jar to his left and then $5$ jars to his right. There remained exactly $3$ full jars of both strawberry and blueberry jam. 
\n\nJars are numbered from $1$ to $2n$ from left to right, so Karlsson initially stands between jars $n$ and $n+1$.\n\nWhat is the minimum number of jars Karlsson is required to empty so that an equal number of full strawberry and blueberry jam jars is left?\n\nYour program should answer $t$ independent test cases.\n\n\n-----Input-----\n\nThe first line contains one integer $t$ ($1 \\le t \\le 1000$) \u2014 the number of test cases.\n\nThe first line of each test case contains a single integer $n$ ($1 \\le n \\le 10^5$).\n\nThe second line of each test case contains $2n$ integers $a_1, a_2, \\dots, a_{2n}$ ($1 \\le a_i \\le 2$) \u2014 $a_i=1$ means that the $i$-th jar from the left is a strawberry jam jar and $a_i=2$ means that it is a blueberry jam jar.\n\nIt is guaranteed that the sum of $n$ over all test cases does not exceed $10^5$.\n\n\n-----Output-----\n\nFor each test case print the answer to it \u2014 the minimum number of jars Karlsson is required to empty so that an equal number of full strawberry and blueberry jam jars is left.\n\n\n-----Example-----\nInput\n4\n6\n1 1 1 2 2 1 2 1 2 1 1 2\n2\n1 2 1 2\n3\n1 1 1 1 1 1\n2\n2 1 1 1\n\nOutput\n6\n0\n6\n2\n\n\n\n-----Note-----\n\nThe picture from the statement describes the first test case.\n\nIn the second test case the number of strawberry and blueberry jam jars is already equal.\n\nIn the third test case Karlsson is required to eat all $6$ jars so that there remain $0$ jars of both jams.\n\nIn the fourth test case Karlsson can empty either the second and the third jars or the third and the fourth one. The both scenarios will leave $1$ jar of both jams.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"for tcase in range(int(input())):\n n=int(input())\n ls = list(map(int, input().split()))\n oneneed = 2*(n - ls.count(1))\n ldct = {0:0}\n ctr = 0\n eaten = 0\n for i in range(n-1,-1,-1):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n\n rdct = {0:0}\n ctr = 0\n eaten = 0\n for i in range(n,2*n):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n \n\n best=99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k]+ldct[otk])\n print(best)","sc_cc":10.0,"mc":"import sys\n\ndef minp():\n\treturn sys.stdin.readline().strip()\n\ndef mint():\n\treturn int(minp())\n\ndef mints():\n\treturn list(map(int,minp().split()))\n\ndef solve():\n\tn = mint()\n\ta = list(mints())\n\tc = dict()\n\tc[0] = 2*n\n\td = 0\n\t\n\tfor i in range(2*n-1,n-1,-1):\n\t\tif a[i] == 1:\n\t\t\td += 1\n\t\telse:\n\t\t\td -= 1\n\t\t\n\t\tc[d] = i\n\t\n\td = 0\n\tr = 2*n\n\tr = min(r, n + c[0] - n)\n\tfor i in range(n):\n\t\tif a[i] == 1:\n\t\t\td += 1\n\t\telse:\n\t\t\td -= 1\n\t\t\n\t\tif (-d) in c:\n\t\t\tr = min(r, n - i - 1 + c[-d] - n)\n\t\n\treturn r\n\n\nfor i in range(mint()):\n\tprint(solve())","mc_cc":2.2,"transformed_mc":["\ndef find_jars_to_empty(t, test_cases):\n result = []\n for i in range(t):\n n = test_cases[i][0]\n ls = test_cases[i][1]\n oneneed = 2 * (n - ls.count(1))\n ldct, rdct = get_counts(n, ls)\n best = find_min_jars(ldct, rdct, oneneed)\n result.append(best)\n return result\n\ndef get_counts(n, ls):\n ldct = {0: 0}\n rdct = {0: 0}\n ctr = 0\n eaten = 0\n for i in range(n-1, -1, -1):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n\n ctr = 0\n eaten = 0\n for i in range(n, 2*n):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n return ldct, rdct\n\ndef find_min_jars(ldct, rdct, oneneed):\n best = 99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k] + ldct[otk])\n return best\n\ndef main():\n t = int(input())\n test_cases = []\n for _ in range(t):\n n = int(input())\n jars = list(map(int, input().split()))\n test_cases.append((n, jars))\n\n result = find_jars_to_empty(t, test_cases)\n for res in result:\n print(res)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["t = int(input())\ntest_cases = []\nfor _ in range(t):\n n = int(input())\n jars = list(map(int, input().split()))\n test_cases.append((n, jars))\nresult = []\nfor i in range(t):\n n = test_cases[i][0]\n ls = test_cases[i][1]\n oneneed = 2 * (n - ls.count(1))\n ldct = {0: 0}\n rdct = {0: 0}\n ctr = 0\n eaten = 0\n for j in range(n-1, -1, -1):\n eaten += 1\n ctr += (1 if ls[j] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n ctr = 0\n eaten = 0\n for j in range(n, 2*n):\n eaten += 1\n ctr += (1 if ls[j] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n best = 99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k] + ldct[otk])\n result.append(best)\nfor res in result:\n print(res)"]} 2 | {"problem_id":692,"problem_description":"There is crazy man named P29892P. He always tries to do crazy things as he thinks. One day he invented a machine and named it as ANGEN. The ANGEN is used to perform range operations. The range operation means performing operations on range values from {L..R}. 
At any time it performs operations on values in between L to R.\nANGEN can perform following operations\nU I V - Update the value present at I with value V\nA L R - Find the sum between range L and R\nM L R - Find the maximum number between L and R\nm L R - Find the minimum number between L and R\nS L R - Find second maximum value in between L and R\ns L R - Find second mimimum value in between L and R\nIf it is not possible perform operation ANGEN returns \u201cNA\u201d with out quotes.\nFor Invalid operations ANGEN returns \u201c!!!\u201d with out quotes.\nNow P29892P challenging his friends and you too, to build his invention with yourown code. So it's your time to defeat P29892P by implementing his invention with your own ability. Let's go and solve the problem.\n\n-----Input-----\nInput description.\n- The first line of the input contains an integer N denoting the number of integers. \n- The next line contains N space separated integers.\"\n- The next line contains a single integer Q denoting the number of Questions.\n- The next Q lines contains T Question type , L and R.\n\n-----Output-----\nPrint output for each question in separate line.\n\n-----Constraints-----\nShould contain all the constraints on the input data that you may have. Format it like:\n- 1 \u2264 N \u2264 100000\n- 1 \u2264 values[i] \u2264 1000000000\n- 1 \u2264 Q \u2264 10000\n- T in { A..Z, a..z }\n- 1 \u2264 L \u2264 R \u2264 N\n\n-----Example-----\nInput:\n6\n1 2 5 3 10 6\n6\nA 1 5\nM 1 3\nm 5 6\ns 3 6\nU 1 7\nS 1 2\n\nOutput:\n21\n5\n6\n5\n2\n\n-----Explanation-----\n...","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"n = eval(input())\narr = list(map(int,input().split()))\nq = eval(input())\nwhile q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1])\n r = int(ar[2])\n l -= 1\n if t == 'U':\n arr[l] = r\n elif t == 'A':\n print(sum(arr[l:r]))\n elif t == 'M':\n print(max(arr[l:r]))\n elif t == 'm':\n print(min(arr[l:r]))\n elif t == 'S':\n m = max(arr[l:r])\n m2 = -1\n for i in range(l, r):\n if arr[i] < m and arr[i] > m2:\n m2 = arr[i]\n print(m2)\n elif t == 's':\n m = min(arr[l:r])\n m2 = 1000000000000\n for i in range(l, r):\n if arr[i] > m and arr[i] < m2:\n m2 = arr[i]\n print(m2)","sc_cc":14.0,"mc":"def na(l,r):\n if l-1>=r or l>n or r>n:\n return True\n else:\n return False\n\ndef na_print():\n print('NA')\n\n\n\ndef func(t,l,r):\n if t=='U':\n try:\n a[l-1]=r\n return ''\n except:\n print('NA')\n return ''\n elif na(l,r):\n na_print()\n return ''\n \n if t=='A':\n print(sum(a[l-1:r]))\n \n elif t=='M':\n print(max(a[l-1:r]))\n \n elif t=='m':\n print(min(a[l-1:r]))\n \n elif t=='S':\n bb = list(set(a[l-1:r]))\n bb.sort()\n print(bb[-2])\n \n elif t=='s':\n bb = list(set(a[l-1:r]))\n bb.sort()\n print(bb[1])\n \n else:\n print('!!!')\n\n\n\n\nn = eval(input())\na = list(map(int, input().split()))\nq = eval(input())\n\nfor i in range(q):\n t,l,r = input().split()\n func(t,int(l),int(r))","mc_cc":4.0,"transformed_mc":["\ndef update_value(arr, idx, val):\n arr[idx] = val\n\ndef find_sum(arr, start, end):\n return sum(arr[start:end])\n\ndef find_max(arr, start, end):\n return max(arr[start:end])\n\ndef find_min(arr, start, end):\n return min(arr[start:end])\n\ndef find_second_max(arr, start, end):\n max_val = max(arr[start:end])\n second_max = -1\n for i in range(start, end):\n if arr[i] < max_val and arr[i] > second_max:\n second_max = arr[i]\n return second_max\n\ndef find_second_min(arr, start, end):\n min_val = min(arr[start:end])\n second_min = 1000000000000\n for i in range(start, end):\n if arr[i] > min_val and arr[i] < second_min:\n second_min = arr[i]\n return second_min\n\ndef main():\n n = int(input())\n arr = list(map(int, input().split()))\n q = int(input())\n \n while q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1]) - 1\n r = int(ar[2])\n \n if t == 'U':\n update_value(arr, l, r)\n elif t == 'A':\n print(find_sum(arr, l, r))\n elif t == 'M':\n print(find_max(arr, l, r))\n elif t == 'm':\n print(find_min(arr, l, r))\n elif t == 'S':\n print(find_second_max(arr, l, r))\n elif t == 's':\n print(find_second_min(arr, l, r))\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["n = int(input())\narr = list(map(int, input().split()))\nq = int(input())\n\nwhile q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1]) - 1\n r = int(ar[2])\n \n if t == 'U':\n arr[l] = r\n elif t == 'A':\n print(sum(arr[l:r]))\n elif t == 'M':\n print(max(arr[l:r]))\n elif t == 'm':\n print(min(arr[l:r]))\n elif t == 'S':\n max_val = max(arr[l:r])\n second_max = -1\n for i in range(l, r):\n if arr[i] < max_val and arr[i] > second_max:\n second_max = arr[i]\n print(second_max)\n elif t == 's':\n min_val = min(arr[l:r])\n second_min = 1000000000000\n for i in range(l, r):\n if arr[i] > min_val and arr[i] < second_min:\n second_min = arr[i]\n print(second_min)"]} 3 | --------------------------------------------------------------------------------