├── apps
│   ├── eval
│   │   ├── __init__.py
│   │   └── apps_metric.py
│   ├── eval.sh
│   ├── make_demonstration.py
│   ├── filter.py
│   ├── utils.py
│   ├── eval.py
│   ├── sc2tmc.py
│   ├── icl.sh
│   ├── data
│   │   ├── 2shot_demonstration_101seed.json
│   │   ├── 2shot_demonstration_27seed.json
│   │   └── 2shot_demonstration_42seed.json
│   └── icl.py
├── README.md
└── codecontests
    ├── icl_corr.sh
    ├── evaluate_corr.sh
    ├── ppl.sh
    ├── evaluate.sh
    ├── preprocess_original_dataset_ft.py
    ├── icl_gpt.sh
    ├── ft.sh
    ├── preprocess_original_dataset_icl.py
    ├── ppl.py
    ├── evaluate_.py
    ├── evaluate_gpt.py
    ├── utils
    │   └── utils_evaluate.py
    ├── calculate_corr.ipynb
    ├── evaluate_ft.py
    ├── evaluate_corr.py
    ├── construct_mc_sc_divided_dataset.ipynb
    ├── icl_ft.sh
    ├── data
    │   ├── monolithic_2shot_demonstration_169seed.jsonl
    │   ├── monolithic_2shot_demonstration_134seed.jsonl
    │   └── monolithic_2shot_demonstration_42seed.jsonl
    ├── construct_demonstration_for_correlation_experiment.ipynb
    ├── calculate_corr_between_mos_and_function_call.ipynb
    ├── sc2mc.py
    ├── icl_corr.py
    ├── icl.py
    └── icl_ft.py

/apps/eval/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Revisiting the Impact of Pursuing Modularity for Code Generation
2 | Official Repository for "Revisiting the Impact of Pursuing Modularity for Code Generation" [[Paper (arXiv)](https://arxiv.org/abs/2407.11406)]
3 | 
4 | Deokyeong Kang, Kijung Seo, Taeuk Kim. _**Accepted to EMNLP 2024 Findings**_
5 | 
6 | 
7 | ## Contents
8 | 
9 | * apps: source code for the APPS dataset
10 |   * In-Context Learning: icl.py
11 | * codecontests: source code for the CodeContests dataset
12 |   * Modularity score (MoS) metric: utils/utils.py
13 |   * In-Context Learning: icl.py
14 |   * Fine-tuning: ft.py
15 |   * Correlation experiment: icl_corr.py
16 |   * Perplexity experiment: ppl.py
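17 | 
18 | ## Modularity score (MoS) at a glance
19 | 
20 | The exact MoS implementation used in the paper lives in `codecontests/utils/utils.py`. Purely as an illustration of the general idea (this is *not* the actual metric, and the helper name and heuristic below are hypothetical), a modularity-style proxy can be computed with the standard `ast` module by measuring how much of a program is factored into functions:
21 | 
22 | ```python
23 | import ast
24 | 
25 | def function_line_fraction(code: str) -> float:
26 |     """Toy modularity proxy: fraction of source lines inside function bodies."""
27 |     tree = ast.parse(code)
28 |     covered = set()
29 |     for node in ast.walk(tree):
30 |         # count every line that belongs to a (sync or async) function definition
31 |         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
32 |             covered.update(range(node.lineno, node.end_lineno + 1))
33 |     return len(covered) / max(len(code.splitlines()), 1)
34 | ```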
--------------------------------------------------------------------------------
/codecontests/icl_corr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # inference for correlation experiment
4 | # CL 7b, pass@1(n=10)
5 | seed=42
6 | size=7
7 | model=meta-llama/CodeLlama-${size}b-hf
8 | # size=6.7
9 | # model=deepseek-ai/deepseek-coder-${size}b-base
10 | num_gpu=4
11 | dtype=float16
12 | num_icl_shot=1
13 | num_gen=10
14 | temperature=0.1
15 | swap_space=8
16 | for metric in var_len; do
17 |     CUDA_VISIBLE_DEVICES=0,1,2,3 python icl_corr.py \
18 |         --seed ${seed} \
19 |         --model ${model} \
20 |         --num_gpu ${num_gpu} \
21 |         --dtype ${dtype} \
22 |         --num_icl_shot ${num_icl_shot} \
23 |         --num_gen ${num_gen} \
24 |         --temperature ${temperature} \
25 |         --max_new_token 1024 \
26 |         --top_p 0.95 \
27 |         --swap_space ${swap_space} \
28 |         > log/inference/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}.log 2>&1 \
29 |         --metric ${metric}
30 |     echo cl${size}b ${metric} inference ends
31 | done
32 | 
--------------------------------------------------------------------------------
/codecontests/evaluate_corr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # CL
4 | size=7
5 | model=meta-llama/CodeLlama-${size}b-hf
6 | num_icl_shot=1
7 | num_gen=10
8 | temperature=0.1
9 | k=1
10 | 
11 | for metric in var_len; do
12 |     python evaluate_corr.py \
13 |         --model ${model} \
14 |         --num_icl_shot ${num_icl_shot} \
15 |         --num_gen ${num_gen} \
16 |         --temperature ${temperature} \
17 |         --metric ${metric} \
18 |         --k ${k} \
19 |         > log/evaluation/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}_corr.log 2>&1
20 |     echo cl${size}b ${metric} score correlation evaluation ends
21 | done
22 | 
23 | 
24 | # # DS
25 | # size=6.7
26 | # model=deepseek-ai/deepseek-coder-${size}b-base
27 | # num_gpu=1
28 | # num_icl_shot=1
29 | # num_gen=10
30 | # temperature=0.1
31 | # k=1
32 | 
33 | # for metric in style modularity; do
34 | #     python evaluate_corr.py \
35 | #         --model ${model} \
36 | #         --num_icl_shot ${num_icl_shot} \
37 | #         --num_gen ${num_gen} \
38 | #         --temperature ${temperature} \
39 | #         --metric ${metric} \
40 | #         --k ${k} \
41 | #         > log/evaluation/ds${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${metric}_corr.log 2>&1
42 | #     echo ds${size}b ${metric} score correlation evaluation ends
43 | # done
--------------------------------------------------------------------------------
/apps/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | temperature=0.1
4 | num_icl_shot=2
5 | code_type_=(mc sc tmc tsc)
6 | model_name=$1
7 | if [ "${model_name}" == deepseek ]; then
8 |     model=deepseek-ai/deepseek-coder-6.7b-base
9 | else
10 |     model=meta-llama/CodeLlama-7b-hf
11 | fi
12 | 
13 | task() {
14 |     local seed=$1
15 |     for code_type in "${code_type_[@]}"; do
16 |         python -u eval.py --seed ${seed} \
17 |             --model ${model} --num_icl_shot ${num_icl_shot} \
18 |             --num_gen 10 --code_type ${code_type} \
19 |             --temperature ${temperature} --modify original \
20 |             > log/evaluation/${model//\//-}_${code_type}_original_${num_icl_shot}shot_10gen_${temperature}temp_${seed}.log 2>&1
21 |     done
22 |     task_completed $seed
23 | }
24 | 
25 | task_completed() {
26 |     local seed=$1
27 |     # Start task for the next seed
28 |     next_seed=$(next_seed $seed)
29 |     if [ -n "$next_seed" ]; then
30 |         task $next_seed &
31 |     fi
32 | }
33 | 
34 | 
35 | next_seed() {
36 |     local seed=$1
37 |     case $seed in
38 |         27) echo 42 ;;
39 |         42) echo 101 ;;
40 |         101) echo 134 ;;
41 |         134) echo 169 ;;
42 |         169) echo "" ;;
43 |     esac
44 | }
45 | 
46 | 
47 | # Start the first task
48 | task 27 &
49 | 
50 | # Wait for all background jobs to finish
51 | wait
--------------------------------------------------------------------------------
/codecontests/ppl.sh:
--------------------------------------------------------------------------------
1 | (nohup python ppl.py --gpu 0 --model meta-llama/CodeLlama-7b-hf --mod low --include_prompt > log/ppl_include_prompt/cl7b_low_mod.log 2>&1) &
2 | (nohup python ppl.py --gpu 1 --model meta-llama/CodeLlama-7b-hf --mod high --include_prompt > log/ppl_include_prompt/cl7b_high_mod.log 2>&1) &
3 | (nohup python ppl.py --gpu 2 --model deepseek-ai/deepseek-coder-6.7b-base --mod low --include_prompt > log/ppl_include_prompt/ds7b_low_mod.log 2>&1) &
4 | (nohup python ppl.py --gpu 3 --model deepseek-ai/deepseek-coder-6.7b-base --mod high --include_prompt > log/ppl_include_prompt/ds7b_high_mod.log 2>&1) &
5 | wait &&
6 | echo 7b model done!
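7 | # The 33B/34B runs below follow the same pattern; they are left running in the
8 | # background (no trailing wait), hence the final "in progress" echo.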
9 | (nohup python ppl.py --gpu 0 --model meta-llama/CodeLlama-34b-hf --mod low --include_prompt > log/ppl_include_prompt/cl34b_low_mod.log 2>&1) &
10 | (nohup python ppl.py --gpu 1 --model meta-llama/CodeLlama-34b-hf --mod high --include_prompt > log/ppl_include_prompt/cl34b_high_mod.log 2>&1) &
11 | (nohup python ppl.py --gpu 2 --model deepseek-ai/deepseek-coder-33b-base --mod low --include_prompt > log/ppl_include_prompt/ds33b_low_mod.log 2>&1) &
12 | (nohup python ppl.py --gpu 3 --model deepseek-ai/deepseek-coder-33b-base --mod high --include_prompt > log/ppl_include_prompt/ds33b_high_mod.log 2>&1) &
13 | echo 33b model in progress!
--------------------------------------------------------------------------------
/apps/make_demonstration.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import torch
4 | import os
5 | from datasets import Dataset
6 | from collections import defaultdict
7 | 
8 | 
9 | def set_seed(seed):
10 |     random.seed(seed)
11 |     np.random.seed(seed)
12 |     torch.manual_seed(seed)
13 |     torch.cuda.manual_seed(seed)
14 |     # When running on the CuDNN backend, two further options must be set
15 |     torch.backends.cudnn.deterministic = True
16 |     torch.backends.cudnn.benchmark = False
17 |     # Set a fixed value for the hash seed
18 |     os.environ["PYTHONHASHSEED"] = str(seed)
19 | 
20 | 
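21 | # Randomly sample two training problems and collect their monolithic (sc) and
22 | # modular (mc) reference solutions to serve as the 2-shot ICL demonstrations.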
" 41 | "Please wrap your code answer using ```:" 42 | ) 43 | 44 | demonstration["problem_id"].append(data["problem_id"]) 45 | demonstration["problem_description"].append(data["question"].strip()) 46 | demonstration["starter_code"].append(data["starter_code"]) 47 | demonstration["sc_instruction"].append(sc_instruction) 48 | demonstration["mc_instruction"].append(mc_instruction) 49 | demonstration["sc"].append(data["sc"][0].strip()) 50 | demonstration["sc_cc"].append(data["sc_cc"][0]) 51 | demonstration["mc"].append(data["mc"][0].strip()) 52 | demonstration["mc_cc"].append(data["mc_cc"][0]) 53 | 54 | return demonstration 55 | 56 | 57 | for seed in [27, 42, 101, 134, 169]: 58 | set_seed(seed) 59 | dataset = Dataset.from_json("data/filtered_APPS.json") 60 | demonstration = extract_demonstration(dataset) 61 | Dataset.from_dict(demonstration).to_json( 62 | f"data/2shot_demonstration_{seed}seed.json" 63 | ) 64 | -------------------------------------------------------------------------------- /codecontests/evaluate.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # # CL 4 | # num_icl_shot=0 5 | # num_gen=1 6 | # temperature=0.1 7 | # k=1 8 | 9 | # for size in 34; do 10 | # for seed in 27 42 101 134 169; do 11 | # python evaluate_.py \ 12 | # --model meta-llama/CodeLlama-${size}b-hf \ 13 | # --seed ${seed} \ 14 | # --num_icl_shot ${num_icl_shot} \ 15 | # --num_gen ${num_gen} \ 16 | # --temperature ${temperature} \ 17 | # --k ${k} \ 18 | # > log/evaluation/2shot_mc/cl${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 19 | # echo cl${size}b ${num_icl_shot} mc ${seed}seed evaluation ends 20 | # done 21 | # done 22 | 23 | # # DS 24 | # num_icl_shot=2 25 | # num_gen=50 26 | # temperature=0.6 27 | # k=10 28 | # for size in 33; do 29 | # for seed in 27 42 101 134 169; do 30 | # python evaluate_.py \ 31 | # --model deepseek-ai-deepseek-coder-${size}b-base \ 32 | # --seed ${seed} \ 33 | # --num_icl_shot ${num_icl_shot} \ 34 | # --num_gen ${num_gen} \ 35 | # --temperature ${temperature} \ 36 | # --k ${k} \ 37 | # > log/evaluation/2shot_mc/ds${size}b_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 38 | # echo ds${size}b ${num_icl_shot} mc ${seed}seed evaluation ends 39 | # done 40 | # done 41 | 42 | 43 | # inference after fine-tuning 44 | num_icl_shot=0 45 | num_gen=50 46 | temperature=0.6 47 | k=1 48 | # degree=low 49 | debug_mode=0 50 | chkpt=_final 51 | 52 | for degree in low high; do 53 | for seed in 27; do 54 | python evaluate_ft.py \ 55 | --model meta-llama/CodeLlama-7b-hf \ 56 | --seed ${seed} \ 57 | --num_icl_shot ${num_icl_shot} \ 58 | --num_gen ${num_gen} \ 59 | --temperature ${temperature} \ 60 | --k ${k} \ 61 | --degree ${degree} \ 62 | --chkpt ${chkpt} \ 63 | > log/evaluation/tmp/CodeLlama_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 64 | done 65 | done 66 | 67 | # --model meta-llama/CodeLlama-7b-hf \ 68 | # --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp 69 | 70 | 71 | # # gpt 72 | # num_icl_shot=2 73 | # num_gen=10 74 | # temperature=0.1 75 | # k=1 76 | 77 | # for code_type in monolithic modular transformed_modular transformed_monolithic; do 78 | # for seed in 134; do 79 | # python evaluate_gpt.py \ 80 | # --model gpt-4o-mini \ 81 | # --code_type ${code_type} \ 82 | # --seed ${seed} \ 83 | # --num_icl_shot ${num_icl_shot} \ 84 | # --num_gen ${num_gen} \ 85 | # --temperature ${temperature} \ 86 | # --k ${k} \ 87 | # >> 
log/evaluation/gpt/gpt-4o-mini_${code_type}_code_${num_icl_shot}shot_${temperature}temp_${num_gen}gen.log 2>&1 88 | # done 89 | # done 90 | -------------------------------------------------------------------------------- /apps/filter.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, concatenate_datasets 2 | import json 3 | from utils import * 4 | from radon.complexity import cc_visit 5 | from eval.apps_metric import apps_metric 6 | import os 7 | 8 | def filtering(dataset): 9 | words = ["codeforces", "atcoder", "codechef"] 10 | dataset = dataset.filter(lambda x: any(word in x["url"] for word in words)) 11 | dataset = make_solution_column(dataset) 12 | 13 | if os.path.exists( 14 | "data/apps_results.json" 15 | ): 16 | results = json.load( 17 | open( 18 | "data/apps_results.json", 19 | "r", 20 | ) 21 | ) 22 | else: 23 | eval_apps = apps_metric() 24 | results, _ = eval_apps._compute( 25 | dataset, k_list=[1], split="train", column_name="solution" 26 | ) 27 | json.dump( 28 | results, 29 | open( 30 | "data/apps_results.json", 31 | "w", 32 | ), 33 | ) 34 | 35 | data = [] 36 | for index in results: 37 | sc = [] 38 | sc_cc = [] 39 | mc = [] 40 | mc_cc = [] 41 | cc_criteria = 10 42 | for i, result in enumerate(results[index]): 43 | try: 44 | code = process_text(dataset[int(index)]["solution"][i]) 45 | code_cc = get_avg_cc(code) 46 | if all(x == True for x in result): 47 | if code_cc >= cc_criteria: 48 | sc.append(code) 49 | sc_cc.append(code_cc) 50 | else: 51 | visit = cc_visit(code) 52 | count = [ 53 | count_module_written(code, func.name) 54 | for func in visit.functions 55 | ] 56 | TF = all(x >= 2 for x in count) 57 | if len(count) >= 3 and TF: 58 | mc.append(code) 59 | mc_cc.append(code_cc) 60 | except: 61 | pass 62 | data.append({"mc": mc, "mc_cc": mc_cc, "sc": sc, "sc_cc": sc_cc}) 63 | 64 | final_data = concatenate_datasets([dataset, Dataset.from_list(data)], axis=1) 65 | final_data = final_data.filter( 66 | lambda x: x["sc"] != [] 67 | and x["mc"] != [] 68 | and -10 not in x["sc_cc"] 69 | and -10 not in x["mc_cc"] 70 | ) 71 | 72 | return final_data 73 | 74 | 75 | def main(): 76 | 77 | dataset_name = "codeparrot/apps" 78 | 79 | dataset = load_dataset( 80 | dataset_name, 81 | trust_remote_code=True, 82 | split="train", 83 | ) 84 | 85 | filtered_dataset = filtering(dataset) 86 | filtered_dataset.to_json( 87 | f"data/filtered_APPS.json" 88 | ) 89 | 90 | 91 | if __name__ == "__main__": 92 | main() 93 | -------------------------------------------------------------------------------- /codecontests/preprocess_original_dataset_ft.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from datasets import load_dataset 4 | from utils.utils_evaluate import safe_eval_answer_from_agent_ft 5 | from utils.utils import get_code_modularity_score 6 | 7 | 8 | # delete all solutions in another language except python in the dataset 9 | def leave_python_solution(example): 10 | solutions = example['solutions']['solution'] 11 | language_index = example['solutions']['language'] 12 | 13 | python_solution = [] 14 | for i, lang in enumerate(language_index): 15 | if lang == 3: # python3 16 | python_solution.append(solutions[i]) 17 | 18 | example['solutions']['solution'] = python_solution 19 | del example['solutions']['language'] 20 | return example 21 | 22 | 23 | # remove annotated parts in the code 24 | def remove_annotation(example): 25 | def remove_annotation_(input_string): 
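        # Strip "#" line comments, then '''...''' and """...""" blocks.
        # Note: this regex approach also removes legitimate triple-quoted
        # string literals, not only docstrings.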
26 | modified_string = re.sub(r"#.*?(?=\n)", '', input_string) 27 | modified_string = re.sub(r"'''.*?'''", '', modified_string, flags=re.DOTALL) 28 | modified_string = re.sub(r'""".*?"""', '', modified_string, flags=re.DOTALL) 29 | return modified_string 30 | 31 | for i in range(len(example['solutions']['solution'])): 32 | example['solutions']['solution'][i] = remove_annotation_(example['solutions']['solution'][i]) 33 | 34 | return example 35 | 36 | 37 | def calculate_mos(example): 38 | scores = [] 39 | for code in example['solutions']['solution']: 40 | try: 41 | modularity_score = get_code_modularity_score(code.strip()) 42 | except: 43 | modularity_score = -1 44 | 45 | scores.append(modularity_score) 46 | 47 | example['solutions']['modularity'] = scores 48 | 49 | return example 50 | 51 | 52 | def start(split): 53 | base_dir = os.path.dirname(__file__) 54 | 55 | # load original dataset 56 | dataset = load_dataset("deepmind/code_contests") 57 | dataset = dataset[split] 58 | # dataset = dataset[split].select(range(5)) # for test 59 | print(f'len(dataset): {len(dataset)}') 60 | # 1. filter questions without any python solution 61 | print('1') 62 | dataset = dataset.filter(lambda example: 3 in example['solutions']['language']) 63 | # 2. retain only python solutions in problem 64 | print('2') 65 | dataset = dataset.map(leave_python_solution, num_proc=16) 66 | # 3. remove annotation in the code 67 | print('3') 68 | dataset = dataset.map(remove_annotation, num_proc=16) 69 | # 4. retain only python solutions that pass the test cases 70 | print('4') 71 | dataset = dataset.map(safe_eval_answer_from_agent_ft, num_proc=16) 72 | # 5. calculate MoS score of code 73 | print('5') 74 | dataset = dataset.map(calculate_mos, num_proc=16) 75 | # 6. save 76 | dataset.to_json(os.path.join(base_dir, 'data/ft', f'my_code_contests_{split}.jsonl')) 77 | 78 | 79 | # start('test') 80 | start('valid') 81 | # start('train') -------------------------------------------------------------------------------- /apps/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from radon.complexity import cc_visit 4 | 5 | def write_dict_to_jsonl(dictionary, filename): 6 | import json 7 | 8 | with open(filename, "a") as file: 9 | for item in dictionary: 10 | json.dump(item, file) 11 | file.write("\n") 12 | 13 | 14 | def read_jsonl_to_dict(filename): 15 | import json 16 | 17 | result = [] 18 | with open(filename, "r") as file: 19 | for line in file: 20 | item = json.loads(line.strip()) 21 | result.append(item) 22 | return result 23 | 24 | 25 | def count_module_written(code, module): 26 | indices = [] 27 | index = -1 28 | # find all parts starting with module name in the code 29 | while True: 30 | index = code.find(module, index + 1) 31 | if index == -1: 32 | break 33 | indices.append(index) 34 | 35 | # filter 36 | permit_left_char = [ 37 | " ", 38 | "(", 39 | ":", 40 | "+", 41 | "-", 42 | "*", 43 | "/", 44 | "//", 45 | "%", 46 | "=", 47 | "<", 48 | ">", 49 | "!", 50 | "~", 51 | "&", 52 | "|", 53 | "^", 54 | ] 55 | permit_right_char = [" ", "("] 56 | cnt = 0 57 | for index in indices: 58 | if ( 59 | code[index - 1] in permit_left_char 60 | and code[index + len(module)] in permit_right_char 61 | ): 62 | cnt += 1 63 | 64 | return cnt 65 | 66 | 67 | # calculate average cc of code 68 | def get_avg_cc(code): 69 | try: 70 | visitor = cc_visit(code) 71 | 72 | # 1. 
average cc of modules 73 | total_module_complexity = 0 74 | num_module = 0 75 | for module in visitor.blocks: 76 | # only consider function or method of class as module 77 | if module.__class__.__name__ == "Function": 78 | total_module_complexity += module.complexity 79 | num_module += 1 80 | 81 | # 2. cc of body code 82 | body_complexity = visitor.complexity 83 | 84 | # 3. average cc of the program 85 | avg_cc = (total_module_complexity + body_complexity) / (num_module + 1) 86 | except: 87 | # cc_visit fails to return because the input code has some errors 88 | avg_cc = -10 89 | 90 | return avg_cc 91 | 92 | 93 | def process_text(input_string): 94 | modified_string = re.sub(r"#.*?(?=\n)", "", input_string) 95 | modified_string = re.sub(r"'''.*?'''", "", modified_string, flags=re.DOTALL) 96 | modified_string = re.sub(r'""".*?"""', "", modified_string, flags=re.DOTALL) 97 | return modified_string 98 | 99 | 100 | def make_solution_column(dataset): 101 | solution = [] 102 | for problem in dataset: 103 | solution.append(json.loads(problem["solutions"])) 104 | dataset = dataset.add_column("solution", solution) 105 | return dataset -------------------------------------------------------------------------------- /codecontests/icl_gpt.sh: -------------------------------------------------------------------------------- 1 | # lets go 2 | model=gpt-4o-mini 3 | num_icl_shot=2 4 | num_gen=10 5 | temperature=0.1 6 | debug_mode=0 7 | 8 | # for code_type in monolithic; do 9 | # for seed in 27 42 101 134; do 10 | # python icl_gpt.py \ 11 | # --seed ${seed} \ 12 | # --model ${model} \ 13 | # --num_icl_shot ${num_icl_shot} \ 14 | # --num_gen ${num_gen} \ 15 | # --temperature ${temperature} \ 16 | # --max_new_token 1024 \ 17 | # --code_type ${code_type} \ 18 | # --debug_mode ${debug_mode} \ 19 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 20 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 21 | # done 22 | # done 23 | 24 | # for code_type in modular; do 25 | # for seed in 27 42 101 134 169; do 26 | # python icl_gpt.py \ 27 | # --seed ${seed} \ 28 | # --model ${model} \ 29 | # --num_icl_shot ${num_icl_shot} \ 30 | # --num_gen ${num_gen} \ 31 | # --temperature ${temperature} \ 32 | # --max_new_token 1024 \ 33 | # --code_type ${code_type} \ 34 | # --debug_mode ${debug_mode} \ 35 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 36 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 37 | # done 38 | # done 39 | 40 | # for code_type in transformed_monolithic; do 41 | # for seed in 27 42 101 134 169; do 42 | # python icl_gpt.py \ 43 | # --seed ${seed} \ 44 | # --model ${model} \ 45 | # --num_icl_shot ${num_icl_shot} \ 46 | # --num_gen ${num_gen} \ 47 | # --temperature ${temperature} \ 48 | # --max_new_token 1024 \ 49 | # --code_type ${code_type} \ 50 | # --debug_mode ${debug_mode} \ 51 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 52 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 53 | # done 54 | # done 55 | 56 | for code_type in transformed_modular; do 57 | for seed in 27 42 134 169; do 58 | python icl_gpt.py \ 59 | --seed ${seed} \ 60 | --model ${model} \ 61 | --num_icl_shot ${num_icl_shot} \ 62 | --num_gen ${num_gen} \ 63 | --temperature ${temperature} \ 64 | --max_new_token 1024 \ 65 | --code_type ${code_type} \ 66 | 
--debug_mode ${debug_mode} \ 67 | > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 68 | echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 69 | done 70 | done 71 | 72 | 73 | # # # for test 74 | # model=gpt-4o-mini 75 | # num_icl_shot=2 76 | # num_gen=10 77 | # temperature=0.1 78 | # debug_mode=0 79 | 80 | # for code_type in transformed_modular; do 81 | # for seed in 101; do 82 | # python icl_gpt.py \ 83 | # --seed ${seed} \ 84 | # --model ${model} \ 85 | # --num_icl_shot ${num_icl_shot} \ 86 | # --num_gen ${num_gen} \ 87 | # --temperature ${temperature} \ 88 | # --max_new_token 1024 \ 89 | # --code_type ${code_type} \ 90 | # --debug_mode ${debug_mode} \ 91 | # > log/inference/gpt/gpt-4o-mini_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 92 | # echo ${model} ${num_icl_shot}shot ${code_type} inference ${seed} ends 93 | # done 94 | # done -------------------------------------------------------------------------------- /apps/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from eval.apps_metric import apps_metric 4 | from eval.utils import get_results 5 | import argparse 6 | from datasets import Dataset 7 | import re 8 | import os 9 | import argparse 10 | 11 | from tqdm import tqdm 12 | 13 | from utils import read_jsonl_to_dict, write_dict_to_jsonl 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--seed", type=int, default=42) 19 | parser.add_argument("--model", type=str, default="meta-llama/CodeLlama-7b-hf") 20 | parser.add_argument("--num_icl_shot", type=int, default=2) 21 | parser.add_argument( 22 | "--num_gen", 23 | type=int, 24 | default=10, 25 | help="number of solutions generated per problem", 26 | ) 27 | parser.add_argument("--code_type", type=str, default="sc") 28 | parser.add_argument( 29 | "--temperature", 30 | type=float, 31 | default=0.1, 32 | help="0 means greedy decoding for vllm", 33 | ) 34 | parser.add_argument("--k", type=int, default=1, help="k of pass@k") 35 | parser.add_argument( 36 | "--modify", 37 | type=str, 38 | default="original", 39 | help="modification method of the demonstration code", 40 | ) 41 | parser.add_argument( 42 | "--level", type=str, default="all", help="level of the evaluation" 43 | ) 44 | 45 | args = parser.parse_args() 46 | 47 | base_directory = os.path.dirname(__file__) 48 | file_name = f"{args.model.replace('/', '-')}_{args.code_type}_{args.modify}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 49 | 50 | data = read_jsonl_to_dict(os.path.join(base_directory, "result", file_name)) 51 | data = Dataset.from_list(data) 52 | 53 | if not os.path.exists( 54 | os.path.join(base_directory, "tf", file_name.replace("result.jsonl", "tf.json")) 55 | ): 56 | eval_apps = apps_metric() 57 | results, metrics = eval_apps._compute( 58 | data, 59 | k_list=[1, 5], 60 | level=args.level, 61 | split="test", 62 | column_name="extracted_solutions", 63 | ) 64 | json.dump( 65 | results, 66 | open( 67 | os.path.join( 68 | base_directory, "tf", file_name.replace("result.jsonl", "tf.json") 69 | ), 70 | "w", 71 | ), 72 | ) 73 | else: 74 | results = json.load(open(os.path.join(base_directory, "tf", file_name.replace("result.jsonl", "tf.json")),"r")) 75 | print("\n\n\nResults: pass@k on all level") 76 | get_results( 77 | data, 78 | k_list=[1, 5], 79 | ) 80 | 81 | 82 | results_list = 
[results[index] for index in results] 83 | passed_list = [] 84 | for results in results_list: 85 | for result in results: 86 | passed = [] 87 | for element in result: 88 | passed.append([int(element)]) 89 | passed_list.append(passed) 90 | data = data.add_column("passed", passed_list) 91 | for difficulty in ["introductory", "interview", "competition"]: 92 | print(f"\n\n\nResults: pass@k on {difficulty} level") 93 | get_results( 94 | data.filter(lambda x: x["difficulty"] == difficulty)["passed"], 95 | k_list=[1, 5], 96 | ) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /codecontests/ft.sh: -------------------------------------------------------------------------------- 1 | for degree in high; do 2 | python my_run_clm.py \ 3 | --model_name_or_path deepseek-ai/deepseek-coder-6.7b-base \ 4 | --train_file data/ft_final/my_code_contests_train_${degree}.jsonl \ 5 | --validation_file data/ft_final/my_code_contests_valid_${degree}.jsonl \ 6 | --output_dir tmp/deepseek/${degree} \ 7 | --save_steps 100 \ 8 | --logging_steps 30 \ 9 | --evaluation_strategy steps \ 10 | --max_eval_samples 50 \ 11 | --torch_dtype bfloat16 \ 12 | --block_size 2048 \ 13 | --preprocessing_num_workers 8 \ 14 | --trust_remote_code 1 \ 15 | --do_train \ 16 | --do_eval \ 17 | --learning_rate 5e-5 \ 18 | --num_train_epochs 1 \ 19 | --per_device_train_batch_size 4 \ 20 | --per_device_eval_batch_size 4 \ 21 | --gradient_accumulation_steps 16 \ 22 | --lr_scheduler_type cosine \ 23 | --warmup_ratio 0.01 \ 24 | --low_cpu_mem_usage True \ 25 | --overwrite_output_dir True \ 26 | --report_to wandb \ 27 | --run_name deepseekcoder-7b-${degree}-mod \ 28 | --resume_from_checkpoint /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/deepseek/high/checkpoint-200 29 | done 30 | 31 | # --max_train_samples 50 \ 32 | 33 | 34 | # # start from checkpoint 35 | # degree=low 36 | # python my_run_clm.py \ 37 | # --model_name_or_path meta-llama/CodeLlama-7b-hf \ 38 | # --train_file data/ft_final/my_code_contests_train_${degree}.jsonl \ 39 | # --validation_file data/ft_final/my_code_contests_valid_${degree}.jsonl \ 40 | # --output_dir tmp/CodeLlama \ 41 | # --save_steps 5 \ 42 | # --evaluation_strategy steps \ 43 | # --max_train_samples 10 \ 44 | # --torch_dtype bfloat16 \ 45 | # --block_size 2048 \ 46 | # --preprocessing_num_workers 8 \ 47 | # --trust_remote_code 1 \ 48 | # --do_train \ 49 | # --do_eval \ 50 | # --learning_rate 5e-5 \ 51 | # --num_train_epochs 1 \ 52 | # --per_device_train_batch_size 1 \ 53 | # --per_device_eval_batch_size 1 \ 54 | # --gradient_accumulation_steps 1 \ 55 | # --lr_scheduler_type cosine \ 56 | # --warmup_ratio 0.01 \ 57 | # --low_cpu_mem_usage True \ 58 | # --overwrite_output_dir True \ 59 | # --report_to wandb \ 60 | # --run_name codellama-7b-${degree}-mod \ 61 | # --resume_from_checkpoint /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/CodeLlama/checkpoint-5 \ 62 | 63 | 64 | 65 | # --max_train_samples 50 \ 66 | 67 | 68 | # python my_run_clm.py \ 69 | # --model_name_or_path meta-llama/CodeLlama-7b-hf \ 70 | # --train_file data/ft_final/my_code_contests_train_low.jsonl \ 71 | # --validation_file data/ft_final/my_code_contests_valid_low.jsonl \ 72 | # --output_dir tmp/CodeLlama \ 73 | # --save_steps 60 \ 74 | # --evaluation_strategy steps \ 75 | # --max_train_samples 50 \ 76 | # --max_eval_samples 1 \ 77 | # --torch_dtype bfloat16 \ 78 | # --block_size 2048 \ 79 | # --preprocessing_num_workers 8 \ 80 | # --trust_remote_code 1 
\
81 | #     --do_train \
82 | #     --do_eval \
83 | #     --learning_rate 5e-5 \
84 | #     --num_train_epochs 2 \
85 | #     --per_device_train_batch_size 1 \
86 | #     --per_device_eval_batch_size 1 \
87 | #     --gradient_accumulation_steps 1 \
88 | #     --lr_scheduler_type cosine \
89 | #     --warmup_ratio 0.01 \
90 | #     --low_cpu_mem_usage True \
91 | #     --overwrite_output_dir True \
92 | #     --report_to wandb \
93 | #     --run_name codellama-7b-low-mod \
94 | 
95 | 
96 | 
97 | 
98 | 
99 | # # --max_eval_samples 50 \
100 | # # --logging_steps 20 \
101 | 
102 | 
--------------------------------------------------------------------------------
/apps/eval/apps_metric.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Evaluation of code generation on the APPS benchmark"""
15 | 
16 | import evaluate
17 | import datasets
18 | from .utils import compute_metrics
19 | from .testing_util import run_test
20 | 
21 | 
22 | _CITATION = """\
23 | @article{hendrycksapps2021,
24 |     title={Measuring Coding Challenge Competence With APPS},
25 |     author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
26 |     journal={NeurIPS},
27 |     year={2021}
28 | }
29 | """
30 | 
31 | 
32 | _DESCRIPTION = """\
33 | This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
34 | APPS" (https://arxiv.org/pdf/2105.09938.pdf).
35 | """
36 | 
37 | 
38 | # TODO: Add description of the arguments of the module here
39 | _KWARGS_DESCRIPTION = """
40 | Computes average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
41 | Args:
42 |     predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from the APPS dataset.
43 | 
44 | Returns:
45 |     metrics: dict of three metrics: average accuracy, strict accuracy, and pass@k.
46 | Examples:
47 |     >>> my_new_module = evaluate.load("loubnabnl/apps_metric")
48 |     >>> results = my_new_module.compute(predictions=[["s=input()\nprint(s)"]])
49 |     >>> print(results)
50 |     {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
51 | """
52 | 
53 | 
54 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
55 | class apps_metric(evaluate.EvaluationModule):
56 |     """Evaluate code generation on APPS benchmark.
57 | The generations are compiled and their corresponding unit tests are run""" 58 | 59 | def _info(self): 60 | return evaluate.EvaluationModuleInfo( 61 | module_type="metric", 62 | description=_DESCRIPTION, 63 | citation=_CITATION, 64 | inputs_description=_KWARGS_DESCRIPTION, 65 | features=datasets.Features( 66 | { 67 | "predictions": datasets.Sequence(datasets.Value("string")), 68 | } 69 | ), 70 | homepage="https://github.com/hendrycks/apps", 71 | reference_urls=["https://huggingface.co/datasets/codeparrot/apps"], 72 | ) 73 | 74 | def _compute( 75 | self, 76 | data, 77 | k_list=[1, 10, 100], 78 | count_errors=True, 79 | level="all", 80 | debug=False, 81 | split="test", 82 | column_name="extracted_solutions", 83 | ): 84 | """Returns the scores""" 85 | results, metrics = compute_metrics( 86 | data, 87 | k_list=k_list, 88 | count_errors=count_errors, 89 | level=level, 90 | debug=debug, 91 | split=split, 92 | column_name=column_name, 93 | ) 94 | return results, metrics 95 | -------------------------------------------------------------------------------- /apps/sc2tmc.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from datasets import Dataset 3 | from utils import * 4 | from eval.apps_metric import apps_metric 5 | 6 | from filter import * 7 | 8 | 9 | def mc_transform(question, sc): 10 | client = OpenAI(api_key=your_key) 11 | try: 12 | messages = [ 13 | { 14 | "role": "system", 15 | "content": "You are an AI programming assistant.", 16 | }, 17 | { 18 | "role": "user", 19 | "content": f"""QUESTION: 20 | {question} 21 | 22 | ANSWER: 23 | ```python 24 | {sc} 25 | ``` 26 | Refactor the above program. Follow the guidelines 27 | * make the program more modular with smaller and meaningful helper functions 28 | * good descriptive names for the helper functions 29 | * have an entry function called 'main()' 30 | * 'main()' is called inside 'if __name__ == '__main__'' 31 | 32 | Do not change the original semantics of the program significantly and no need to perform optimizations. 
Enclose the program within backticks as shown above.""", 33 | }, 34 | ] 35 | 36 | completion = client.chat.completions.create( 37 | model="gpt-3.5-turbo", 38 | messages=messages, 39 | max_tokens=1024, 40 | stop=["\n\n\n\n", "QUESTION:", "ANSWER:"], 41 | temperature=0.6, 42 | n=20, 43 | ) 44 | 45 | response = [] 46 | for choice in completion.choices: 47 | content = choice.message.content 48 | response.append(extract_solution(content)) 49 | return response 50 | 51 | except: 52 | return None 53 | 54 | 55 | def extract_solution(code): 56 | start_index = code.find("```python") 57 | if start_index == -1: 58 | solution = code 59 | else: 60 | end_index = code.find("```", start_index + len("```python")) 61 | if start_index < end_index: 62 | solution = code[start_index + len("```python") : end_index] 63 | else: 64 | solution = code[start_index + len("```python") :] 65 | return solution 66 | 67 | 68 | def main(): 69 | eval_apps = apps_metric() 70 | for seed in [27, 42, 101, 134, 169]: 71 | data = Dataset.from_json(f"data/2shot_demonstration_{seed}seed.json") 72 | dataset = data.map( 73 | lambda x: {"tmc": mc_transform(x["problem_description"], x["sc"])} 74 | ) 75 | results, _ = eval_apps._compute( 76 | dataset, k_list=[1], split="train", column_name="tmc" 77 | ) 78 | transformed_mc = [] 79 | for index in results: 80 | passed_code = [] 81 | for i, result in enumerate(results[index]): 82 | code = dataset["tmc"][int(index)][i] 83 | print(code) 84 | if all(x == True for x in result): 85 | visit = cc_visit(code) 86 | count = [ 87 | count_module_written(code, func.name) 88 | for func in visit.functions 89 | ] 90 | TF = all(x >= 2 for x in count) 91 | if len(count) >= 3 and TF: 92 | passed_code.append([code]) 93 | break 94 | if not len(passed_code) > 0: 95 | # raise ValueError("No code passed the criteria") 96 | break 97 | else: 98 | transformed_mc.append(passed_code[0]) 99 | if len(transformed_mc) == len(dataset): 100 | dataset = dataset.remove_columns(["tmc"]) 101 | dataset = dataset.add_column("transformed_mc", transformed_mc) 102 | dataset.to_json( 103 | f"data/2shot_demonstration_{seed}seed.json" 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /codecontests/preprocess_original_dataset_icl.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from datasets import load_dataset 4 | from utils.utils_evaluate import safe_eval_answer_from_agent 5 | from radon.complexity import cc_visit 6 | 7 | 8 | # delete all solutions in another language except python in the dataset 9 | def leave_python_solution(example): 10 | solutions = example['solutions']['solution'] 11 | language_index = example['solutions']['language'] 12 | 13 | python_solution = [] 14 | for i, lang in enumerate(language_index): 15 | if lang == 3: # python3 16 | python_solution.append(solutions[i]) 17 | 18 | example['solutions']['solution'] = python_solution 19 | del example['solutions']['language'] 20 | return example 21 | 22 | 23 | # remove annotated parts in the code 24 | def remove_annotation(example): 25 | def remove_annotation_(input_string): 26 | modified_string = re.sub(r"#.*?(?=\n)", '', input_string) 27 | modified_string = re.sub(r"'''.*?'''", '', modified_string, flags=re.DOTALL) 28 | modified_string = re.sub(r'""".*?"""', '', modified_string, flags=re.DOTALL) 29 | return modified_string 30 | 31 | for i in range(len(example['solutions']['solution'])): 32 | 
example['solutions']['solution'][i] = remove_annotation_(example['solutions']['solution'][i]) 33 | 34 | return example 35 | 36 | 37 | # calculate cc and module list of code and add them to dataset 38 | def add_cc_and_modules(example): 39 | ccs = [] 40 | modules = [] 41 | 42 | for code in example['solutions']['solution']: 43 | cc, module_name = get_avg_cc_and_module(code) 44 | ccs.append(cc) 45 | modules.append(module_name) 46 | 47 | example['solutions']['cc'] = ccs 48 | example['solutions']['modules'] = modules 49 | 50 | return example 51 | 52 | 53 | # calculate average cc of each solution code and add it 54 | def get_avg_cc_and_module(code): 55 | try: 56 | module_name = [] 57 | visitor = cc_visit(code) 58 | 59 | # 1. average cc of modules 60 | total_module_complexity = 0 61 | num_module = 0 62 | for module in visitor.blocks: 63 | # only consider function or method of class as module 64 | if module.__class__.__name__ == 'Function': 65 | module_name.append(module.name) 66 | total_module_complexity += module.complexity 67 | num_module += 1 68 | 69 | # 2. cc of body code 70 | body_complexity = visitor.complexity 71 | 72 | # 3. average cc of the program 73 | avg_cc = (total_module_complexity + body_complexity) / (num_module + 1) 74 | except: 75 | # cc_visit fails to return because the input code has some errors 76 | avg_cc = 0 77 | module_name = [] 78 | 79 | return avg_cc, module_name 80 | 81 | 82 | def start(split): 83 | base_dir = os.path.dirname(__file__) 84 | 85 | # load original dataset 86 | dataset = load_dataset("deepmind/code_contests", cache_dir='/data/huggingface/datasets') 87 | dataset = dataset[split] 88 | # 1. filter questions without any python solution 89 | dataset = dataset.filter(lambda example: 3 in example['solutions']['language']) 90 | # 2. retain only python solutions in problem 91 | dataset = dataset.map(leave_python_solution, num_proc=2) 92 | # 3: mark each python solution passed or not by running the test cases 93 | dataset = dataset.map(safe_eval_answer_from_agent, num_proc=1) 94 | # 4. remove annotation parts of code 95 | dataset = dataset.map(remove_annotation) 96 | # 5. add cc and modules names contained in the code to the dataset 97 | dataset = dataset.map(add_cc_and_modules, num_proc=16) 98 | # 6. 
save 99 | dataset.to_json(os.path.join(base_dir, 'data', f'my_code_contests_{split}.jsonl')) 100 | 101 | 102 | start('test') 103 | start('valid') 104 | start('train') -------------------------------------------------------------------------------- /codecontests/ppl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForCausalLM 3 | from utils.utils import read_jsonl_to_dict 4 | from tqdm import tqdm 5 | import argparse 6 | import random 7 | import numpy as np 8 | import os 9 | 10 | 11 | def set_seed(seed): 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed(seed) 16 | # When running on the CuDNN backend, two further options must be set 17 | torch.backends.cudnn.deterministic = True 18 | torch.backends.cudnn.benchmark = False 19 | # Set a fixed value for the hash seed 20 | os.environ["PYTHONHASHSEED"] = str(seed) 21 | 22 | 23 | set_seed(42) 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--gpu", type=int, required=True, default=0) 27 | parser.add_argument("--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf") 28 | parser.add_argument("--include_prompt", action='store_true') 29 | parser.add_argument("--mod", type=str, required=True) 30 | args = parser.parse_args() 31 | 32 | device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu") 33 | tokenizer = AutoTokenizer.from_pretrained(args.model) 34 | 35 | if 'CodeLlama' in args.model: 36 | dtype = torch.float16 37 | elif 'deepseek' in args.model: 38 | dtype = torch.bfloat16 39 | 40 | model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=dtype) 41 | model = model.to(device) 42 | tokenizer.pad_token = tokenizer.eos_token 43 | 44 | path = f'/home/kdy20401/Workspace/Proj-Code-Generation/MC/data/my_code_contests_train_{args.mod}_mod.jsonl' 45 | dataset = read_jsonl_to_dict(path) 46 | 47 | losses = [] 48 | perplexity = [] 49 | # length = [] 50 | problems = [] 51 | for j, data in enumerate(dataset): 52 | description = data['description'] 53 | code = data['code'] 54 | 55 | if args.include_prompt == True: 56 | instruction = ( 57 | "Write a python code to solve the following coding problem " 58 | "that obeys the constraints and passes the example test cases. " 59 | "The output code needs to read from and write to standard IO. 
" 60 | "Please wrap your code answer using ```:" 61 | ) 62 | if 'CodeLlama' in args.model: 63 | prefix = "" 64 | prefix += "Q: " + instruction + "\n" 65 | prefix += description + "\n" 66 | prefix += "A: " 67 | elif 'deepseek' in args.model: 68 | prefix = "" 69 | prefix += instruction + '\n' 70 | prefix += "### Instruction:\n" + description + "\n" 71 | prefix += "### Response:\n" 72 | 73 | prompt = prefix + code 74 | all_tokens = tokenizer(prompt, return_tensors="pt", max_length=8192, truncation=True).to(device) 75 | prefix_tokens = tokenizer(prefix, return_tensors="pt", max_length=8192, truncation=True).to(device) 76 | code_start_index = len(prefix_tokens['input_ids'][0]) 77 | labels = all_tokens['input_ids'].clone() 78 | labels[:, :code_start_index] = -100 # ignore loss of prefix 79 | else: 80 | prompt = code 81 | all_tokens = tokenizer(prompt, return_tensors="pt", max_length=8192, truncation=True).to(device) 82 | labels = all_tokens['input_ids'] 83 | 84 | # problem 85 | problems.append(data['name']) 86 | with torch.no_grad(): 87 | outputs = model(all_tokens['input_ids'], labels=labels) 88 | loss = outputs.loss 89 | # loss 90 | losses.append(loss) 91 | 92 | ppl = torch.exp(outputs.loss).item() 93 | if ppl != torch.nan: 94 | perplexity.append(ppl) 95 | else: 96 | print('nan!') 97 | 98 | 99 | # print(min(length), max(length)) 100 | print(f'model: {args.model}') 101 | print(f'dataset of {args.mod} modularity') 102 | # print(f'average nll: {torch.stack(losses).mean()}') 103 | print(f'average ppl: {sum(perplexity) / len(perplexity)}') 104 | -------------------------------------------------------------------------------- /codecontests/evaluate_.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | default=0, 42 | required=True, 43 | help="0 means greedy decoding for vllm", 44 | ) 45 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 46 | 47 | args = parser.parse_args() 48 | 49 | base_directory = os.path.dirname(__file__) 50 | test_dataset = load_dataset( 51 | "deepmind/code_contests", split="test", 52 | ) 53 | 54 | result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 55 | 56 | if not os.path.exists(os.path.join(base_directory, "result", result_file)): 57 | return 58 | 59 | if os.path.exists(os.path.join(base_directory, "result", "2shot_mc", result_file)): 60 | return 61 | 62 | result_data = 
read_jsonl_to_dict(os.path.join(base_directory, "result", result_file)) 63 | assert len(result_data) == 165 64 | 65 | start = time.time() 66 | passed_results = [] 67 | for i, data in enumerate(result_data): 68 | # make test cases for each problem 69 | tests = {"inputs": [], "outputs": []} 70 | tests["inputs"].extend(data["public_tests"]["input"]) 71 | tests["inputs"].extend(data["private_tests"]["input"]) 72 | tests["outputs"].extend(data["public_tests"]["output"]) 73 | tests["outputs"].extend(data["private_tests"]["output"]) 74 | assert len(tests["inputs"]) == len(tests["outputs"]) 75 | 76 | time_limit = test_dataset[i]["time_limit"]["seconds"] 77 | passed = [] 78 | for code in data["extracted_solutions"]: 79 | manager = multiprocessing.Manager() 80 | manager_list = manager.list() 81 | p = multiprocessing.Process( 82 | target=_temp_run, args=(code, tests, manager_list) 83 | ) 84 | p.start() 85 | p.join(timeout=time_limit + 1) 86 | 87 | if p.is_alive(): 88 | p.kill() 89 | if not manager_list: 90 | passed.append(0) 91 | else: 92 | if manager_list[0] == True: 93 | passed.append(1) 94 | else: 95 | passed.append(0) 96 | 97 | result_data[i]["passed"] = passed # new data 98 | passed_results.append(passed) 99 | 100 | print(f"time: {time.time() - start:.2f}s") 101 | ks = [args.k] 102 | performance = compute_pass_at_ks(passed_results, ks) 103 | print(f"pass@{ks[0]}: {performance}") 104 | # statistics for one dot in the correlation figure 105 | # add pass information to result_data and save 106 | write_dict_to_jsonl(result_data, os.path.join(base_directory, "result", "2shot_mc", result_file)) 107 | print(f'{result_file} saved.') 108 | 109 | print('program ends.') 110 | 111 | 112 | 113 | 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /codecontests/evaluate_gpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | default=0, 42 | required=True, 43 | help="0 means greedy decoding for vllm", 44 | ) 45 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 46 | parser.add_argument("--code_type", type=str, required=True) 47 | 48 | args = parser.parse_args() 49 | 50 | base_directory = os.path.dirname(__file__) 51 | test_dataset = load_dataset( 52 | "deepmind/code_contests", split="test", 53 | ) 54 | 55 | result_file = 
f"{args.model}_{args.code_type}_code_{args.num_icl_shot}shot_{args.num_gen}gen_{args.seed}seed_icl_result.jsonl" 56 | 57 | if not os.path.exists(os.path.join(base_directory, "result/gpt", result_file)): 58 | print('result file does not exist') 59 | return 60 | 61 | if os.path.exists(os.path.join(base_directory, "result/gpt/result", result_file)): 62 | print('result file already exists') 63 | return 64 | 65 | result_data = read_jsonl_to_dict(os.path.join(base_directory, "result/gpt", result_file)) 66 | print(f'result file path:') 67 | print(os.path.join(base_directory, "result/gpt/", result_file)) 68 | 69 | start = time.time() 70 | passed_results = [] 71 | for i, data in enumerate(result_data): 72 | # make test cases for each problem 73 | tests = {"inputs": [], "outputs": []} 74 | tests["inputs"].extend(data["public_tests"]["input"]) 75 | tests["inputs"].extend(data["private_tests"]["input"]) 76 | tests["outputs"].extend(data["public_tests"]["output"]) 77 | tests["outputs"].extend(data["private_tests"]["output"]) 78 | assert len(tests["inputs"]) == len(tests["outputs"]) 79 | 80 | time_limit = test_dataset[i]["time_limit"]["seconds"] 81 | passed = [] 82 | for code in data["extracted_solutions"]: 83 | manager = multiprocessing.Manager() 84 | manager_list = manager.list() 85 | p = multiprocessing.Process( 86 | target=_temp_run, args=(code, tests, manager_list) 87 | ) 88 | p.start() 89 | p.join(timeout=time_limit + 1) 90 | 91 | if p.is_alive(): 92 | p.kill() 93 | if not manager_list: 94 | passed.append(0) 95 | else: 96 | if manager_list[0] == True: 97 | passed.append(1) 98 | else: 99 | passed.append(0) 100 | 101 | result_data[i]["passed"] = passed # new data 102 | passed_results.append(passed) 103 | 104 | # print(f"time: {time.time() - start:.2f}s") 105 | ks = [args.k] 106 | performance = compute_pass_at_ks(passed_results, ks) 107 | print(f"pass@{ks[0]}: {performance}") 108 | # statistics for one dot in the correlation figure 109 | # add pass information to result_data and save 110 | write_dict_to_jsonl(result_data, os.path.join(base_directory, "result/gpt/result", result_file)) 111 | # print(f'{result_file} saved.') 112 | 113 | # print('program ends.') 114 | 115 | 116 | 117 | 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /codecontests/utils/utils_evaluate.py: -------------------------------------------------------------------------------- 1 | # import gzip 2 | import io 3 | import itertools 4 | import json 5 | # import pprint 6 | import numpy as np 7 | # import re 8 | # import sys 9 | # import timeout_decorator 10 | # import traceback 11 | # from collections import Counter 12 | # from io import StringIO 13 | import sys 14 | # from collections import defaultdict 15 | # from datasets import concatenate_datasets, load_dataset 16 | # from multiprocessing import Process, Queue 17 | import multiprocessing 18 | # from tqdm import tqdm 19 | from typing import Dict, List, Union 20 | # import os 21 | # import ast 22 | # import random 23 | # import subprocess 24 | # import tempfile, shutil, os 25 | # from pyext import RuntimeModule 26 | from copy import deepcopy, copy 27 | # from functools import wraps 28 | import time 29 | import contextlib 30 | import pdb 31 | 32 | from utils.utils_execute import run_test 33 | 34 | GLOBAL_TIMEOUT = 10 # TIMEOUT for one solution 35 | 36 | 37 | def safe_eval_answer_from_agent(example): 38 | def _temp_run(code, tests, result): 39 | try: 40 | flag, outcomes = 
verify_code_official(tests, code) 41 | result.append(flag) 42 | except Exception as e: 43 | pass 44 | 45 | tests = {'inputs': [], 'outputs': []} 46 | tests['inputs'].extend(example['public_tests']['input']) 47 | tests['inputs'].extend(example['private_tests']['input']) 48 | tests['outputs'].extend(example['public_tests']['output']) 49 | tests['outputs'].extend(example['private_tests']['output']) 50 | passed = [] 51 | 52 | for code in example['solutions']['solution']: 53 | manager = multiprocessing.Manager() 54 | result = manager.list() 55 | p = multiprocessing.Process(target=_temp_run, args=(code, tests, result)) 56 | p.start() 57 | p.join(timeout=GLOBAL_TIMEOUT + 1) 58 | if p.is_alive(): 59 | p.kill() 60 | if not result: 61 | result = [-1] 62 | 63 | if result[0] == True: 64 | passed.append(True) 65 | else: 66 | passed.append(False) 67 | 68 | example['solutions']['passed'] = passed 69 | return example 70 | 71 | def verify_code_official(tests, solution, debug=False, return_output=False): 72 | ''' verify if code passes all tests, using apps official implementation (https://github.com/hendrycks/apps/blob/main/eval/testing_util.py#L122) 73 | ''' 74 | tests = deepcopy(tests) 75 | # suppress the stdout of solution execution 76 | # todo: suppress stderr as well 77 | with contextlib.redirect_stdout(io.StringIO()): 78 | results = run_test(tests, solution, debug=debug, return_output=return_output) 79 | if return_output: 80 | tmp = results 81 | all_outputs = results[1] 82 | results = results[0] 83 | if all([res == True for res in results]): 84 | if return_output: 85 | return True, results, all_outputs 86 | return True, results 87 | else: 88 | if return_output: 89 | return False, results, all_outputs 90 | return False, results 91 | 92 | def estimate_pass_at_k( 93 | num_samples: Union[int, List[int], np.ndarray], 94 | num_correct: Union[List[int], np.ndarray], 95 | k: int, 96 | ) -> np.ndarray: 97 | """ 98 | Estimates pass@k of each problem and returns them in an array. 99 | Taken from https://github.com/openai/human-eval/blob/master/human_eval/evaluation.py#L13. 100 | """ 101 | def estimator(n: int, c: int, k: int) -> float: 102 | """ 103 | Calculates 1 - comb(n - c, k) / comb(n, k). 
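104 |         For example, with n = 10 samples of which c = 3 are correct,
105 |         pass@1 = 1 - C(7, 1) / C(10, 1) = 0.3, i.e. the empirical success rate c / n.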
104 | """ 105 | if n - c < k: 106 | return 1.0 107 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 108 | 109 | if isinstance(num_samples, int): 110 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 111 | else: 112 | assert len(num_samples) == len(num_correct) 113 | num_samples_it = iter(num_samples) 114 | 115 | return np.array( 116 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 117 | ) 118 | 119 | def compute_pass_at_ks(results, ks): 120 | output = { 121 | k: estimate_pass_at_k( 122 | [len(x) for x in results], 123 | [sum([i == True for i in x]) for x in results], 124 | k, 125 | ).mean() 126 | for k in ks 127 | } 128 | return output -------------------------------------------------------------------------------- /apps/icl.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | num_icl_shot=2 4 | code_type = $2 5 | 6 | if [ $1 == deepseek ]; then 7 | model=deepseek-ai/deepseek-coder-6.7b-base 8 | else 9 | model=meta-llama/CodeLlama-7b-hf 10 | fi 11 | 12 | task0() { 13 | local seed=$1 14 | llama_size=7 15 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 16 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 17 | --seed $seed --model ${llama_model} \ 18 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 19 | --num_gen 10 --code_type mc \ 20 | --temperature ${temperature} --max_new_token 1024 \ 21 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 22 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 23 | task1_completed $seed 24 | } 25 | 26 | task1() { 27 | local seed=$1 28 | llama_size=7 29 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 30 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 31 | --seed $seed --model ${llama_model} \ 32 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 33 | --num_gen 10 --code_type mc \ 34 | --temperature ${temperature} --max_new_token 1024 \ 35 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 36 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 37 | task1_completed $seed 38 | } 39 | task2() { 40 | local seed=$1 41 | deepseek_size=6.7 42 | deepseek_model=deepseek-ai/deepseek-coder-${deepseek_size}b-base 43 | CUDA_VISIBLE_DEVICES=0,1 python -u icl.py \ 44 | --seed $seed --model ${deepseek_model} \ 45 | --num_gpu ${num_gpu} --dtype bfloat16 --num_icl_shot ${num_icl_shot} \ 46 | --num_gen 10 --code_type mc \ 47 | --temperature ${temperature} --max_new_token 1024 \ 48 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 49 | > log/deepseek_${deepseek_size}b_${num_icl_shot}shot_${temperature}temp_mc_${seed}.log 2>&1 50 | task2_completed $seed 51 | } 52 | task3() { 53 | local seed=$1 54 | llama_size=7 55 | llama_model=meta-llama/CodeLlama-${llama_size}b-hf 56 | CUDA_VISIBLE_DEVICES=2,3 python -u icl.py \ 57 | --seed $seed --model ${llama_model} \ 58 | --num_gpu ${num_gpu} --dtype float16 --num_icl_shot ${num_icl_shot} \ 59 | --num_gen 10 --code_type sc \ 60 | --temperature ${temperature} --max_new_token 1024 \ 61 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 62 | > log/codellama_${llama_size}b_${num_icl_shot}shot_${temperature}temp_sc_${seed}.log 2>&1 63 | task3_completed $seed 64 | } 65 | 66 | task4() { 67 | local seed=$1 68 | deepseek_size=6.7 69 | deepseek_model=deepseek-ai/deepseek-coder-${deepseek_size}b-base 70 | CUDA_VISIBLE_DEVICES=3 python -u icl.py \ 71 | --seed $seed 
--model ${deepseek_model} \ 72 | --num_gpu ${num_gpu} --dtype bfloat16 --num_icl_shot ${num_icl_shot} \ 73 | --num_gen 10 --code_type sc \ 74 | --temperature ${temperature} --max_new_token 1024 \ 75 | --top_p 0.95 --modify original --swap_space ${swap_space} \ 76 | > log/deepseek_${deepseek_size}b_${num_icl_shot}shot_${temperature}temp_sc_${seed}.log 2>&1 77 | task4_completed $seed 78 | } 79 | task1_completed() { 80 | local seed=$1 81 | # Start task1 for the next seed 82 | next_seed=$(next_seed $seed) 83 | if [ -n "$next_seed" ]; then 84 | task1 $next_seed & 85 | fi 86 | } 87 | 88 | task2_completed() { 89 | local seed=$1 90 | # Start task2 for the next seed 91 | next_seed=$(next_seed $seed) 92 | if [ -n "$next_seed" ]; then 93 | task2 $next_seed & 94 | fi 95 | } 96 | 97 | task3_completed() { 98 | local seed=$1 99 | # Start task1 for the next seed 100 | next_seed=$(next_seed $seed) 101 | if [ -n "$next_seed" ]; then 102 | task3 $next_seed & 103 | fi 104 | } 105 | 106 | task4_completed() { 107 | local seed=$1 108 | # Start task2 for the next seed 109 | next_seed=$(next_seed $seed) 110 | if [ -n "$next_seed" ]; then 111 | task4 $next_seed & 112 | fi 113 | } 114 | 115 | next_seed() { 116 | local seed=$1 117 | case $seed in 118 | 27) echo 42 ;; 119 | 42) echo 101 ;; 120 | 101) echo "" ;; 121 | 169) echo "" ;; 122 | esac 123 | } 124 | 125 | temperature=0.1 126 | num_gpu=2 127 | swap_space=$((64/num_gpu)) 128 | num_icl_shot=2 129 | 130 | 131 | # Start the first tasks 132 | task0 134 & 133 | # task1 27 & 134 | # task3 27 & 135 | # task2 27 & 136 | # task4 27 & 137 | 138 | # Wait for all background jobs to finish 139 | wait -------------------------------------------------------------------------------- /codecontests/calculate_corr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate Correlation (execute after evaluation)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# CL 7b, DS 7b modularity\n", 17 | "\n", 18 | "import os\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "\n", 21 | "from utils.utils_evaluate import compute_pass_at_ks, verify_code_official\n", 22 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_modularity_score\n", 23 | "\n", 24 | "from scipy import stats\n", 25 | "\n", 26 | "\n", 27 | "models = ['meta-llama-CodeLlama-7b-hf', 'deepseek-ai-deepseek-coder-6.7b-base']\n", 28 | "code_element = ['modularity']\n", 29 | "\n", 30 | "num_code = 100\n", 31 | "num_gen = 10\n", 32 | "k = 1\n", 33 | "base_directory = os.getcwd()\n", 34 | "for model in models:\n", 35 | " codes = []\n", 36 | " for element in code_element:\n", 37 | " num_point = 0\n", 38 | " performances = []\n", 39 | " element_values = []\n", 40 | " for code_idx in range(num_code):\n", 41 | " file_name = f'{model}_1shot_10gen_0.1temp_{element}_{code_idx}code_icl_result.jsonl'\n", 42 | " if not os.path.exists(os.path.join(base_directory, \"result\", \"corr_exp_evaluation_result\", file_name)):\n", 43 | " continue\n", 44 | " \n", 45 | " num_point += 1 # number of points in the correlation plot (=number of evaluation result)\n", 46 | " results = read_jsonl_to_dict(os.path.join(base_directory, \"result\", \"corr_exp_evaluation_result\", file_name))\n", 47 | " passed_results = []\n", 48 | " for result in results:\n", 49 | " assert len(result['passed']) == num_gen\n", 50 | " 
passed_results.append(result['passed'])\n", 51 | "\n", 52 | " # code\n", 53 | " codes.append(results[0]['demonstration']['code'][0])\n", 54 | " \n", 55 | " # pass@k\n", 56 | " performances.append(compute_pass_at_ks(passed_results, [k])[k])\n", 57 | " \n", 58 | " # style or modularity\n", 59 | " if element == 'style':\n", 60 | " element_values.append(results[0]['demonstration']['score_style'][0]['score_pep8'])\n", 61 | " elif element == 'modularity':\n", 62 | " element_values.append(results[0]['demonstration']['score_modularity'][0])\n", 63 | " \n", 64 | " # re calculate modularity\n", 65 | " # element_values = []\n", 66 | " # for code in codes:\n", 67 | " # element_values.append(get_code_modularity_score(code))\n", 68 | "\n", 69 | " # plt.scatter(element_values, [0.5] * len(element_values), color='blue', label='only mod')\n", 70 | "\n", 71 | " # calculate correlation\n", 72 | " pearsonr_stat = stats.pearsonr(element_values, performances)\n", 73 | " pearsonr, pearsonr_p = pearsonr_stat.correlation, pearsonr_stat.pvalue\n", 74 | " spearmanr_stat = stats.spearmanr(element_values, performances)\n", 75 | " spearmanr, spearmanr_p = spearmanr_stat.correlation, spearmanr_stat.pvalue\n", 76 | " \n", 77 | " performances = [performance * 100 for performance in performances] # for better visualization\n", 78 | " plt.scatter(element_values, performances, color='red', label='Sampled Data')\n", 79 | " plt.xlabel(element)\n", 80 | " plt.ylabel('pass@k')\n", 81 | " plt.legend()\n", 82 | " plt.show()\n", 83 | " \n", 84 | " print(f'model: {model}')\n", 85 | " print(f'pearsonr: {round(pearsonr, 2)}, pearsonr_p: {round(pearsonr_p, 2)}')\n", 86 | " print(f'spearmanr: {round(spearmanr, 2)}, spearmanr_p: {round(spearmanr_p, 2)}')\n", 87 | " print(f'num data: {num_point}')\n" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "mcg", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.9.19" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /codecontests/evaluate_ft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import multiprocessing 4 | import time 5 | 6 | from tqdm import tqdm 7 | 8 | from utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--seed", type=int, required=True, default=0) 27 | parser.add_argument( 28 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 29 | ) 30 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 31 | parser.add_argument( 32 | "--num_gen", 33 | type=int, 34 | required=True, 35 | default=1, 36 | help="number of solutions generated per problem", 37 | ) 38 | parser.add_argument( 39 | "--temperature", 40 | type=float, 41 | 
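        # temperature is passed through to vLLM sampling ("0 means greedy");
        # the repo's scripts use 0.1 for pass@1 (num_gen=10) and 0.6 for the
        # larger num_gen=50 runs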
default=0,
42 |         required=True,
43 |         help="0 means greedy decoding for vllm",
44 |     )
45 |     parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k")
46 |     parser.add_argument('--degree', type=str, required=False, default='low')
47 |     parser.add_argument('--chkpt', type=str, required=False, default='0')  # checkpoint tag, e.g. '_final'
48 | 
49 |     args = parser.parse_args()
50 | 
51 |     base_directory = os.path.dirname(__file__)
52 |     test_dataset = load_dataset(
53 |         "deepmind/code_contests", split="test",
54 |     )
55 | 
56 |     if "CodeLlama" in args.model:
57 |         result_file = f"CodeLlama_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
58 |     elif "deepseek" in args.model:
59 |         result_file = f"DeepSeek_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
60 | 
61 |     # result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
62 | 
63 |     if not os.path.exists(os.path.join(base_directory, "result/ft", result_file)):
64 |         print('result file does not exist')
65 |         return
66 | 
67 |     if os.path.exists(os.path.join(base_directory, "result/ft/result", result_file)):
68 |         print('result file already exists')
69 |         return
70 | 
71 |     result_data = read_jsonl_to_dict(os.path.join(base_directory, "result/ft", result_file))
72 |     print('result file path:')
73 |     print(os.path.join(base_directory, "result/ft", result_file))
74 | 
75 |     start = time.time()
76 |     passed_results = []
77 |     for i, data in enumerate(tqdm(result_data)):
78 |         # make test cases for each problem
79 |         tests = {"inputs": [], "outputs": []}
80 |         tests["inputs"].extend(data["public_tests"]["input"])
81 |         tests["inputs"].extend(data["private_tests"]["input"])
82 |         tests["outputs"].extend(data["public_tests"]["output"])
83 |         tests["outputs"].extend(data["private_tests"]["output"])
84 |         assert len(tests["inputs"]) == len(tests["outputs"])
85 | 
86 |         time_limit = test_dataset[i]["time_limit"]["seconds"]
87 |         passed = []
88 |         for code in data["extracted_solutions"]:
89 |             # run each candidate in its own process so the problem's time limit can be enforced
90 |             manager = multiprocessing.Manager()
91 |             manager_list = manager.list()
92 |             p = multiprocessing.Process(
93 |                 target=_temp_run, args=(code, tests, manager_list)
94 |             )
95 |             p.start()
96 |             p.join(timeout=time_limit + 1)
97 | 
98 |             if p.is_alive():
99 |                 p.kill()
100 |             if not manager_list:  # child crashed or timed out before reporting
101 |                 passed.append(0)
102 |             else:
103 |                 passed.append(1 if manager_list[0] == True else 0)
104 | 
105 |         result_data[i]["passed"] = passed  # new data
106 |         passed_results.append(passed)
107 | 
108 |     print(f"time: {time.time() - start:.2f}s")
109 |     ks = [args.k]
110 |     performance = compute_pass_at_ks(passed_results, ks)
111 |     print(f"pass@{ks[0]}: {performance}")
112 |     # add pass information to result_data and save
113 |     write_dict_to_jsonl(result_data, os.path.join(base_directory, "result/ft/result", result_file))
114 |     print(f'{result_file} saved.')
115 | 
116 |     print('program ends.')
117 | 
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 
--------------------------------------------------------------------------------
/codecontests/evaluate_corr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import multiprocessing
4 | import time
5 | 
6 | from tqdm import tqdm
7 | 
8 | from 
utils.utils_evaluate import compute_pass_at_ks, verify_code_official 9 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl 10 | 11 | from datasets import load_dataset 12 | 13 | from scipy import stats 14 | 15 | 16 | def _temp_run(code, tests, passed): 17 | try: 18 | flag, _ = verify_code_official(tests, code) 19 | passed.append(flag) 20 | except Exception as e: 21 | pass 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument( 27 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 28 | ) 29 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 30 | parser.add_argument( 31 | "--num_gen", 32 | type=int, 33 | required=True, 34 | default=1, 35 | help="number of solutions generated per problem", 36 | ) 37 | parser.add_argument( 38 | "--temperature", 39 | type=float, 40 | default=0, 41 | required=True, 42 | help="0 means greedy decoding for vllm", 43 | ) 44 | parser.add_argument("--k", type=int, required=True, default=1, help="k of pass@k") 45 | parser.add_argument( 46 | "--metric", 47 | type=str, 48 | required=True, 49 | default='style', 50 | help="code metric (e.g., style or modularity)", 51 | ) 52 | args = parser.parse_args() 53 | 54 | base_directory = os.path.dirname(__file__) 55 | test_dataset = load_dataset( 56 | "deepmind/code_contests", split="test", cache_dir="/data/huggingface/datasets" 57 | ) 58 | 59 | performances = [] 60 | metrics = [] 61 | 62 | for code_idx in tqdm(range(100)): 63 | result_file = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.metric}_{code_idx}code_icl_result.jsonl" 64 | if not os.path.exists(os.path.join(base_directory, "result", result_file)): 65 | continue 66 | 67 | if os.path.exists(os.path.join(base_directory, "result", "corr_exp_evaluation_result", result_file)): 68 | continue 69 | 70 | result_data = read_jsonl_to_dict(os.path.join(base_directory, "result", result_file)) 71 | assert len(result_data) == 165 72 | 73 | start = time.time() 74 | passed_results = [] 75 | for i, data in enumerate(result_data): 76 | # make test cases for each problem 77 | tests = {"inputs": [], "outputs": []} 78 | tests["inputs"].extend(data["public_tests"]["input"]) 79 | tests["inputs"].extend(data["private_tests"]["input"]) 80 | tests["outputs"].extend(data["public_tests"]["output"]) 81 | tests["outputs"].extend(data["private_tests"]["output"]) 82 | assert len(tests["inputs"]) == len(tests["outputs"]) 83 | 84 | time_limit = test_dataset[i]["time_limit"]["seconds"] 85 | passed = [] 86 | for code in data["extracted_solutions"]: 87 | manager = multiprocessing.Manager() 88 | manager_list = manager.list() 89 | p = multiprocessing.Process( 90 | target=_temp_run, args=(code, tests, manager_list) 91 | ) 92 | p.start() 93 | p.join(timeout=time_limit + 1) 94 | 95 | if p.is_alive(): 96 | p.kill() 97 | if not manager_list: 98 | passed.append(0) 99 | else: 100 | if manager_list[0] == True: 101 | passed.append(1) 102 | else: 103 | passed.append(0) 104 | 105 | result_data[i]["passed"] = passed # new data 106 | passed_results.append(passed) 107 | 108 | print(f"time: {time.time() - start:.2f}s") 109 | ks = [args.k] 110 | performance = compute_pass_at_ks(passed_results, ks) 111 | print(f"pass@{ks[0]}: {performance}") 112 | # statistics for one dot in the correlation figure 113 | performances.append(performance) 114 | metrics.append(result_data[0]['demonstration']) 115 | # add pass information to result_data and save 116 | 
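        # result_data now carries a 0/1 `passed` list (length num_gen) per
        # problem; saving it lets calculate_corr.ipynb recompute pass@k and
        # the metric correlations later without re-running the solutions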
write_dict_to_jsonl(result_data, os.path.join(base_directory, "result", "corr_exp_evaluation_result", result_file)) 117 | print(f'{result_file} saved.') 118 | 119 | print('program ends.') 120 | 121 | # # compute correlation 122 | # pass_at_k = [e[args.k] for e in performances] 123 | 124 | # if args.metric == 'style': 125 | # style = [m['score_style'][0]['score_pep8'] for m in metrics] 126 | # print(stats.pearsonr(style, pass_at_k)) 127 | # print(stats.spearmanr(style, pass_at_k)) 128 | # elif args.metric == 'modularity': 129 | # modularity = [m['score_modularity'][0] for m in metrics] 130 | # print(stats.pearsonr(modularity, pass_at_k)) 131 | # print(stats.spearmanr(modularity, pass_at_k)) 132 | 133 | 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /codecontests/construct_mc_sc_divided_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# divide existing codes into monolithic and modular codes by certain criteria\n", 10 | "# (ex, average cc and number of modules used)\n", 11 | "def divide_into_monolithic_and_modular_codes(dataset, cc_limit=10, min_num_module=3):\n", 12 | " from utils.utils import count_module_written\n", 13 | " \n", 14 | " \n", 15 | " new_dataset = []\n", 16 | "\n", 17 | " for data in dataset:\n", 18 | " # save basic information\n", 19 | " new_data = {}\n", 20 | " new_data['problem_name'] = data['name']\n", 21 | " new_data['problem_description'] = data['description']\n", 22 | " new_data['public_tests'] = data['public_tests']\n", 23 | " new_data['private_tests'] = data['private_tests']\n", 24 | "\n", 25 | " passed = data['solutions']['passed']\n", 26 | " cc = data['solutions']['cc']\n", 27 | " solution = data['solutions']['solution']\n", 28 | " module_list = data['solutions']['modules']\n", 29 | "\n", 30 | " assert(len(passed) == len(cc) == len(solution) == len(module_list))\n", 31 | "\n", 32 | " # 1. get monolithic code\n", 33 | " monolithic_code_index = []\n", 34 | " for i, modules in enumerate(module_list):\n", 35 | " # filter solution that does not pass the test case\n", 36 | " if not passed[i]:\n", 37 | " continue\n", 38 | " \n", 39 | " if len(modules) == 0 and cc[i] >= cc_limit:\n", 40 | " monolithic_code_index.append(i)\n", 41 | "\n", 42 | " # no monolithic code candidate exists\n", 43 | " # if len(monolithic_code_index) == 0:\n", 44 | " # continue\n", 45 | "\n", 46 | " tmp = {}\n", 47 | " tmp['monolithic_code'] = [solution[i] for i in monolithic_code_index]\n", 48 | " tmp['monolithic_code_cc'] = [cc[i] for i in monolithic_code_index]\n", 49 | " new_data['monolithic_codes'] = tmp\n", 50 | " \n", 51 | " # 2. 
get modular code\n", 52 | " modular_code_index = []\n", 53 | " for i, (code, modules) in enumerate(zip(solution, module_list)):\n", 54 | " # filter solution that does not pass the test case\n", 55 | " if not passed[i]:\n", 56 | " continue\n", 57 | " \n", 58 | " if len(modules) < min_num_module: continue # at least three modules in the code\n", 59 | " module_use_count = [count_module_written(code, module) for module in modules]\n", 60 | " if all(count >= 2 for count in module_use_count): # all modules must be used\n", 61 | " if cc[i] < cc_limit: # and cc of code must be under 10\n", 62 | " modular_code_index.append(i)\n", 63 | " \n", 64 | " # no modular code candidate exists\n", 65 | " # if len(modular_code_index) == 0:\n", 66 | " # continue\n", 67 | " \n", 68 | " tmp = {}\n", 69 | " tmp['modular_code'] = [solution[i] for i in modular_code_index]\n", 70 | " tmp['modular_code_cc'] = [cc[i] for i in modular_code_index]\n", 71 | " new_data['modular_codes'] = tmp\n", 72 | "\n", 73 | " new_dataset.append(new_data)\n", 74 | " \n", 75 | " \n", 76 | " # 3. remove question without pair data is collected\n", 77 | " remove_index = []\n", 78 | " for i, data in enumerate(new_dataset):\n", 79 | " # at least one monolithic code must exist per problem\n", 80 | " # it is okay to have no modular code\n", 81 | " if len(data['monolithic_codes']['monolithic_code']) == 0:\n", 82 | " remove_index.append(i)\n", 83 | " new_dataset = [new_dataset[i] for i in range(len(new_dataset)) if i not in remove_index]\n", 84 | "\n", 85 | " \n", 86 | " return new_dataset" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### load my codecontests dataset and extract problems with both sc and mc codes" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 1, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stderr", 103 | "output_type": "stream", 104 | "text": [ 105 | "/home/kdy20401/anaconda3/envs/code/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 106 | " from .autonotebook import tqdm as notebook_tqdm\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl\n", 112 | "import os\n", 113 | "\n", 114 | "train_dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 115 | "\n", 116 | "_train_dataset = divide_into_monolithic_and_modular_codes(train_dataset)\n", 117 | "\n", 118 | "write_dict_to_jsonl(_train_dataset, os.path.join(os.getcwd(), 'data', 'my_code_contests_divided_train.jsonl'))\n" 119 | ] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "code", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.9.18" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /codecontests/icl_ft.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # num_gpu=4 4 | # dtype=float16 5 | # num_icl_shot=2 6 | # num_gen=50 7 | # temperature=0.6 8 | # swap_space=8 9 | # code_type=modular 10 | 11 | # for size in 34; do 12 | # for seed in 27 42 101 134 169; do 13 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 14 | # --seed ${seed} \ 15 | # --model meta-llama/CodeLlama-${size}b-hf \ 16 | # --num_gpu ${num_gpu} \ 17 | # --dtype ${dtype} \ 18 | # --num_icl_shot ${num_icl_shot} \ 19 | # --num_gen ${num_gen} \ 20 | # --temperature ${temperature} \ 21 | # --max_new_token 1024 \ 22 | # --top_p 0.95 \ 23 | # --swap_space ${swap_space} \ 24 | # --code_type ${code_type} \ 25 | # > log/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 26 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 27 | # done 28 | # done 29 | 30 | # # CL 7b, pass@1(n=10) 31 | # num_gpu=4 32 | # dtype=float16 33 | # num_icl_shot=2 34 | # num_gen=10 35 | # temperature=0.1 36 | # swap_space=8 37 | # code_type=monolithic 38 | 39 | # for size in 34; do 40 | # for seed in 27 42 101 134 169; do 41 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 42 | # --seed ${seed} \ 43 | # --model meta-llama/CodeLlama-${size}b-hf \ 44 | # --num_gpu ${num_gpu} \ 45 | # --dtype ${dtype} \ 46 | # --num_icl_shot ${num_icl_shot} \ 47 | # --num_gen ${num_gen} \ 48 | # --temperature ${temperature} \ 49 | # --max_new_token 1024 \ 50 | # --top_p 0.95 \ 51 | # --swap_space ${swap_space} \ 52 | # --code_type ${code_type} \ 53 | # > log/inference/2shot_mc/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 54 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 55 | # done 56 | # done 57 | 58 | 59 | # # DS 60 | # num_gpu=4 61 | # dtype=bfloat16 62 | # num_icl_shot=2 63 | # num_gen=50 64 | # temperature=0.1 65 | # swap_space=8 66 | # code_type=modular 67 | 68 | # for size in 33; do 69 | # for seed in 27 42 101 134 169; do 70 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 71 | # --seed ${seed} \ 72 | # --model deepseek-ai/deepseek-coder-${size}b-base \ 73 | # --num_gpu ${num_gpu} \ 74 | # --dtype ${dtype} \ 75 | # --num_icl_shot ${num_icl_shot} \ 
76 | # --num_gen ${num_gen} \ 77 | # --temperature ${temperature} \ 78 | # --max_new_token 1024 \ 79 | # --top_p 0.95 \ 80 | # --swap_space ${swap_space} \ 81 | # --code_type ${code_type} \ 82 | # > log/inference/2shot_mc/ds${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 83 | # echo ds${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 84 | # done 85 | # done 86 | 87 | 88 | # num_gpu=4 89 | # dtype=float16 90 | # num_icl_shot=2 91 | # num_gen=50 92 | # temperature=0.6 93 | # swap_space=8 94 | # code_type=modular 95 | 96 | # for size in 34; do 97 | # for seed in 27 42 101 134 169; do 98 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 99 | # --seed ${seed} \ 100 | # --model meta-llama/CodeLlama-${size}b-hf \ 101 | # --num_gpu ${num_gpu} \ 102 | # --dtype ${dtype} \ 103 | # --num_icl_shot ${num_icl_shot} \ 104 | # --num_gen ${num_gen} \ 105 | # --temperature ${temperature} \ 106 | # --max_new_token 1024 \ 107 | # --top_p 0.95 \ 108 | # --swap_space ${swap_space} \ 109 | # --code_type ${code_type} \ 110 | # > log/inference/2shot_mc/cl${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 111 | # echo cl${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 112 | # done 113 | # done 114 | 115 | # # DS 116 | # num_gpu=4 117 | # dtype=bfloat16 118 | # num_icl_shot=2 119 | # num_gen=50 120 | # temperature=0.6 121 | # swap_space=8 122 | # code_type=modular 123 | 124 | # for size in 33; do 125 | # for seed in 27 42 101 134 169; do 126 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python icl.py \ 127 | # --seed ${seed} \ 128 | # --model deepseek-ai/deepseek-coder-${size}b-base \ 129 | # --num_gpu ${num_gpu} \ 130 | # --dtype ${dtype} \ 131 | # --num_icl_shot ${num_icl_shot} \ 132 | # --num_gen ${num_gen} \ 133 | # --temperature ${temperature} \ 134 | # --max_new_token 1024 \ 135 | # --top_p 0.95 \ 136 | # --swap_space ${swap_space} \ 137 | # --code_type ${code_type} \ 138 | # > log/inference/2shot_mc/ds${size}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 139 | # echo ds${size} ${num_icl_shot}shot ${code_type} inference ${seed} ends 140 | # done 141 | # done 142 | 143 | 144 | # inference from ft checkpoint, pass@1(n=10) 145 | seed=27 146 | model=deepseek 147 | num_gpu=2 148 | dtype=float16 149 | num_icl_shot=0 150 | temperature=0.6 151 | code_type=monolithic 152 | swap_space=16 153 | chkpt=_final 154 | num_gen=50 # 155 | debug_mode=0 # 156 | 157 | # for low and high model simultaneously 158 | degree=low 159 | CUDA_VISIBLE_DEVICES=0,1 nohup python icl_ft.py \ 160 | --seed ${seed} \ 161 | --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/${model}/${degree}/ \ 162 | --num_gpu ${num_gpu} \ 163 | --dtype ${dtype} \ 164 | --num_icl_shot ${num_icl_shot} \ 165 | --num_gen ${num_gen} \ 166 | --temperature ${temperature} \ 167 | --max_new_token 1024 \ 168 | --top_p 0.95 \ 169 | --swap_space ${swap_space} \ 170 | --code_type ${code_type} \ 171 | --degree ${degree} \ 172 | --debug_mode ${debug_mode} \ 173 | --chkpt ${chkpt} \ 174 | > log/inference/tmp/${model}_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 175 | 176 | degree=high 177 | CUDA_VISIBLE_DEVICES=2,3 nohup python icl_ft.py \ 178 | --seed ${seed} \ 179 | --model /data/kdy20401/Workspace/Proj-Code-Generation/MC/tmp/${model}/${degree}/ \ 180 | --num_gpu ${num_gpu} \ 181 | --dtype ${dtype} \ 182 | --num_icl_shot ${num_icl_shot} \ 183 | --num_gen ${num_gen} \ 
184 | --temperature ${temperature} \ 185 | --max_new_token 1024 \ 186 | --top_p 0.95 \ 187 | --swap_space ${swap_space} \ 188 | --code_type ${code_type} \ 189 | --degree ${degree} \ 190 | --debug_mode ${debug_mode} \ 191 | --chkpt ${chkpt} \ 192 | > log/inference/tmp/${model}_${degree}_mod_chkpt${chkpt}_${num_icl_shot}shot_${code_type}_${temperature}temp_${num_gen}gen_${seed}.log 2>&1 & 193 | wait && 194 | echo done! -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_169seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["A prime number is a number which has exactly two distinct divisors: one and itself. For example, numbers 2, 7, 3 are prime, and 1, 6, 4 are not.\n\nThe next prime number after x is the smallest prime number greater than x. For example, the next prime number after 2 is 3, and the next prime number after 3 is 5. Note that there is exactly one next prime number after each number. So 5 is not the next prime number for 2.\n\nOne cold April morning Panoramix predicted that soon Kakofonix will break free from his straitjacket, and this will be a black day for the residents of the Gallic countryside.\n\nPanoramix's prophecy tells that if some day Asterix and Obelix beat exactly x Roman soldiers, where x is a prime number, and next day they beat exactly y Roman soldiers, where y is the next prime number after x, then it's time to wait for Armageddon, for nothing can shut Kakofonix up while he sings his infernal song.\n\nYesterday the Gauls beat n Roman soldiers and it turned out that the number n was prime! Today their victims were a troop of m Romans (m > n). Determine whether the Gauls should wait for the black day after today's victory of Asterix and Obelix?\n\nInput\n\nThe first and only input line contains two positive integers \u2014 n and m (2 \u2264 n < m \u2264 50). It is guaranteed that n is prime.\n\nPretests contain all the cases with restrictions 2 \u2264 n < m \u2264 4.\n\nOutput\n\nPrint YES, if m is the next prime number after n, or NO otherwise.\n\nExamples\n\nInput\n\n3 5\n\n\nOutput\n\nYES\n\nInput\n\n7 11\n\n\nOutput\n\nYES\n\nInput\n\n7 9\n\n\nOutput\n\nNO", "A bracket sequence is a string that is one of the following:\n\n1. An empty string;\n2. The concatenation of `(`, A, and `)` in this order, for some bracket sequence A ;\n3. The concatenation of A and B in this order, for some non-empty bracket sequences A and B /\n\n\n\nGiven are N strings S_i. 
Can a bracket sequence be formed by concatenating all the N strings in some order?\n\nConstraints\n\n* 1 \\leq N \\leq 10^6\n* The total length of the strings S_i is at most 10^6.\n* S_i is a non-empty string consisting of `(` and `)`.\n\nInput\n\nInput is given from Standard Input in the following format:\n\n\nN\nS_1\n:\nS_N\n\n\nOutput\n\nIf a bracket sequence can be formed by concatenating all the N strings in some order, print `Yes`; otherwise, print `No`.\n\nExamples\n\nInput\n\n2\n)\n(()\n\n\nOutput\n\nYes\n\n\nInput\n\n2\n)(\n()\n\n\nOutput\n\nNo\n\n\nInput\n\n4\n((()))\n((((((\n))))))\n()()()\n\n\nOutput\n\nYes\n\n\nInput\n\n3\n(((\n)\n)\n\n\nOutput\n\nNo"], "public_tests": [{"input": ["7 9\n", "3 5\n", "7 11\n"], "output": ["NO\n", "YES\n", "YES\n"]}, {"input": ["3\n(((\n)\n)", "2\n)\n(()", "4\n((()))\n((((((\n))))))\n()()()", "2\n)(\n()"], "output": ["No", "Yes", "Yes", "No"]}], "private_tests": [{"input": ["2 6\n", "31 33\n", "2 11\n", "41 49\n", "13 17\n", "23 29\n", "7 8\n", "5 13\n", "47 50\n", "43 47\n", "17 19\n", "5 9\n", "2 50\n", "2 3\n", "3 7\n", "13 20\n", "11 13\n", "19 23\n", "5 11\n", "3 9\n", "5 6\n", "23 25\n", "43 49\n", "5 7\n", "3 4\n", "7 13\n", "3 6\n", "37 41\n", "13 15\n", "2 7\n", "5 15\n", "47 48\n", "2 5\n", "31 37\n", "29 31\n", "19 21\n", "2 4\n", "41 43\n", "47 49\n"], "output": ["NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "NO\n", "YES\n", "NO\n", "NO\n", "NO\n", "NO\n", "NO\n", "YES\n", "YES\n", "NO\n", "NO\n", "YES\n", "NO\n"]}, {"input": [], "output": []}], "transformed_sc": ["n, m = map(int, input().split())\nfound = False\nnum = n + 1\nwhile True:\n if num < 2:\n prime = False\n else:\n prime = True\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n prime = False\n break\n if prime:\n if num == m:\n print(\"YES\")\n found = True\n break\n num += 1\nif not found:\n print(\"NO\")", "import sys\n\n\nn = int(input())\ns = [list(input()) for _ in range(n)]\n\nct1_total = 0\nct2_total = 0\nL = []\n\nfor i in range(n):\n ct1 = 0\n ct2 = 0\n\n for char in s[i]:\n if char == '(':\n ct1 += 1\n else:\n ct2 += 1\n \n ct1_total += ct1\n ct2_total += ct2\n \n ct1 = 0\n ct2 = 0\n l = [0]\n\n for char in s[i]:\n if char == '(':\n ct1 += 1\n l.append(ct1)\n else:\n ct2 += 1\n l.append(-ct2)\n \n L.append(l)\n\nif ct1_total != ct2_total:\n result = 'No'\n print(result)\n sys.exit()\n\nL1 = []\nL2 = []\n\nfor l in L:\n if l[-1] >= 0:\n L1.append((min(l), l[-1]))\n else:\n L2.append((min(l) - l[-1], -l[-1]))\n\nL1.sort()\nL1.reverse()\nct4 = 0\n\nresult1 = ''\nfor i in range(len(L1)):\n if ct4 + L1[i][0] < 0:\n result1 = 'No'\n ct4 += L1[i][1]\n\nif result1 == '':\n result1 = 'Yes'\n\nL2.sort()\nL2.reverse()\nct4 = 0\n\nresult2 = ''\nfor i in range(len(L2)):\n if ct4 + L2[i][0] < 0:\n result2 = 'No'\n ct4 += L2[i][1]\n\nif result2 == '':\n result2 = 'Yes'\n\nif result1 == 'Yes' and result2 == 'Yes':\n result = 'Yes'\nelse:\n result = 'No'\n\nprint(result)"], "sc": ["n,m=map(int,input().split())\ni=n+1\nfor i in range(i,m+1):\n t=0\n for j in range(2,i):\n if(i%j==0):\n t=1\n break\n\n if((i==m and t==1 )or t==0 and i!=m):\n print(\"NO\")\n break\n elif(i==m and t==0):\n print(\"YES\")\n break\n else:\n continue\n \n \n \n", "import sys\nn=int(input())\ns=[list(input()) for i in range(n)]\nL1=[]\nL2=[]\nct1=0;ct2=0\nfor i in range(n):\n ct3=0\n l=[0]\n for j in range(len(s[i])):\n if 
s[i][j]=='(':\n ct1+=1\n ct3+=1\n l.append(ct3)\n else:\n ct2+=1\n ct3-=1\n l.append(ct3)\n if l[-1]>=0:\n L1.append((min(l),l[-1]))\n else:\n L2.append((min(l)-l[-1],-l[-1]))\nif ct1!=ct2:\n print('No')\n sys.exit()\n\nL1.sort()\nL1.reverse()\nct4=0\nfor i in range(len(L1)):\n if ct4+L1[i][0]<0:\n print('No')\n sys.exit()\n ct4+=L1[i][1]\n\nL2.sort()\nL2.reverse()\nct5=0\nfor i in range(len(L2)):\n if ct5+L2[i][0]<0:\n print('No')\n sys.exit()\n ct5+=L2[i][1]\n\nprint('Yes')"], "sc_cc": [10.0, 11.0], "transformed_mc": ["\ndef is_prime(num):\n if num < 2:\n return False\n for i in range(2, int(num**0.5) + 1):\n if num % i == 0:\n return False\n return True\n\ndef find_next_prime(n):\n i = n + 1\n while True:\n if is_prime(i):\n return i\n i += 1\n\ndef check_for_black_day(n, m):\n next_prime = find_next_prime(n)\n if next_prime == m:\n return \"YES\"\n else:\n return \"NO\"\n\ndef main():\n n, m = map(int, input().split())\n result = check_for_black_day(n, m)\n print(result)\n\nif __name__ == '__main__':\n main()\n", "import sys\n\ndef read_input():\n n = int(input())\n s = [list(input()) for _ in range(n)]\n return n, s\n\ndef count_brackets(s):\n ct1 = 0\n ct2 = 0\n\n for char in s:\n if char == '(':\n ct1 += 1\n else:\n ct2 += 1\n\n return ct1, ct2\n\ndef process_string(s):\n ct1 = 0\n ct2 = 0\n l = [0]\n\n for char in s:\n if char == '(':\n ct1 += 1\n l.append(ct1)\n else:\n ct2 += 1\n l.append(-ct2)\n\n return l\n\ndef check_valid_sequences(L):\n L1 = []\n L2 = []\n\n for l in L:\n if l[-1] >= 0:\n L1.append((min(l), l[-1]))\n else:\n L2.append((min(l) - l[-1], -l[-1]))\n\n return L1, L2\n\ndef check_sequence_order(L, ct):\n L.sort()\n L.reverse()\n ct4 = 0\n\n for i in range(len(L)):\n if ct4 + L[i][0] < 0:\n return 'No'\n ct4 += L[i][1]\n\n return 'Yes'\n\ndef check_bracket_sequence(n, s):\n ct1_total = 0\n ct2_total = 0\n L = []\n\n for i in range(n):\n ct1, ct2 = count_brackets(s[i])\n ct1_total += ct1\n ct2_total += ct2\n l = process_string(s[i])\n L.append(l)\n\n if ct1_total != ct2_total:\n return 'No'\n\n L1, L2 = check_valid_sequences(L)\n\n result1 = check_sequence_order(L1, ct1_total)\n result2 = check_sequence_order(L2, ct2_total)\n\n if result1 == 'Yes' and result2 == 'Yes':\n return 'Yes'\n else:\n return 'No'\n\ndef main():\n n, s = read_input()\n result = check_bracket_sequence(n, s)\n print(result)\n\nif __name__ == '__main__':\n main()"]} 2 | -------------------------------------------------------------------------------- /codecontests/construct_demonstration_for_correlation_experiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate code properties among 10% of original data and save" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import random\n", 18 | "from datasets import Dataset\n", 19 | "from utils.utils import get_code_style_score, get_code_modularity_score, read_jsonl_to_dict, write_dict_to_jsonl\n", 20 | "\n", 21 | "\n", 22 | "def compute_code_score(example):\n", 23 | " code = example['code']\n", 24 | " try:\n", 25 | " score_style = get_code_style_score(code)\n", 26 | " score_modularity = get_code_modularity_score(code)\n", 27 | " except Exception:\n", 28 | " score_style = {\n", 29 | " 'score_var': -1.0,\n", 30 | " 'score_pep8': -1.0,\n", 31 | " 'score_style': -1.0,\n", 32 | " }\n", 33 | " 
score_modularity = -1.0\n", 34 | "\n", 35 | " example['score_style'] = score_style\n", 36 | " example['score_modularity'] = score_modularity\n", 37 | " return example\n", 38 | "\n", 39 | "\n", 40 | "def check_code_score(example):\n", 41 | " return example['score_style']['score_var'] >= 0 and example['score_style']['score_pep8'] >= 0 and example['score_modularity'] >= 0\n", 42 | "\n", 43 | "\n", 44 | "dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 45 | "demonstration = []\n", 46 | "\n", 47 | "# aggregate demonstration code\n", 48 | "# keys for dataset: dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])\n", 49 | "# keys for solutions: dict_keys(['cc', 'modules', 'passed', 'solution'])\n", 50 | "for data in dataset:\n", 51 | " for i in range(len(data['solutions']['solution'])):\n", 52 | " if data['solutions']['passed'][i]:\n", 53 | " demonstration.append(\n", 54 | " {\n", 55 | " 'description': data['description'],\n", 56 | " 'code': data['solutions']['solution'][i],\n", 57 | " # more information?\n", 58 | " }\n", 59 | " )\n", 60 | "\n", 61 | "# calculate code metrics\n", 62 | "random.seed(42)\n", 63 | "demonstration = random.sample(demonstration, len(demonstration) // 10) # 10% of total data\n", 64 | "demonstration = Dataset.from_list(demonstration)\n", 65 | "demonstration = demonstration.map(compute_code_score, num_proc=16)\n", 66 | "demonstration = demonstration.filter(check_code_score, num_proc=16)\n", 67 | "\n", 68 | "# save\n", 69 | "# demonstration.save_to_disk(os.path.join(os.getcwd(), 'data', 'demonstration'))\n", 70 | "write_dict_to_jsonl(list(demonstration), os.path.join(os.getcwd(), 'data', 'demonstration.jsonl'))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Get 100 demonstrations of particular code property with evenness" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import os\n", 87 | "import random\n", 88 | "import numpy as np\n", 89 | "import pandas as pd\n", 90 | "import matplotlib.pyplot as plt\n", 91 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score, get_code_modularity_score, get_average_length_of_variables\n", 92 | "\n", 93 | "\n", 94 | "random.seed(42) # for reproducibility\n", 95 | "num_sample = 10 # number of samples to be sampled from each bin\n", 96 | "\n", 97 | "# load demonstration pool\n", 98 | "# each data consists of (problem description, code, style score, modularity score)\n", 99 | "file_name = 'demonstration'\n", 100 | "path = f'/home/kdy20401/Workspace/Proj-Code-Generation/MC/data/{file_name}.jsonl'\n", 101 | "demonstration = read_jsonl_to_dict(path)\n", 102 | "print(f'number of codes in demonstration pool: {len(demonstration)}')\n", 103 | "\n", 104 | "code = []\n", 105 | "style = [] # score_pep8\n", 106 | "modularity = [] # score_modularity\n", 107 | "var_len = []\n", 108 | "for data in demonstration:\n", 109 | " code.append(data['code'])\n", 110 | " style.append(data['score_style']['score_pep8'])\n", 111 | " modularity.append(data['score_modularity'])\n", 112 | " var_len.append(get_average_length_of_variables(data['code']))\n", 113 | "\n", 
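    "# stratified sampling: bin the metric into 10 equal-width bins over\n",
    "# [0, 1] and draw num_sample codes from each bin, so the selected\n",
    "# demonstrations cover the whole metric range rather than clustering\n",
    "# around its typical value\n",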
114 | "style_df = pd.DataFrame({'style': np.array(style)})\n", 115 | "modularity_df = pd.DataFrame({'modularity': np.array(modularity)})\n", 116 | "var_len_df = pd.DataFrame({'var_len': np.array(var_len)})\n", 117 | "\n", 118 | "# bins: 0~0.1, 0.1~0.2, ..., 0.9~1.0\n", 119 | "num_bin = 10\n", 120 | "bins = np.linspace(0, 1, num_bin + 1)\n", 121 | "\n", 122 | "# find the grid cell to which each data point belongs\n", 123 | "# include_lowest=True makes 0 style or modularity value included in the first bin\n", 124 | "# style_df['style_bin'] = pd.cut(style_df['style'], bins=bins, labels=False, include_lowest=True)\n", 125 | "# modularity_df['modularity_bin'] = pd.cut(modularity_df['modularity'], bins=bins, labels=False, include_lowest=True)\n", 126 | "var_len_df['var_len_bin'] = pd.cut(var_len_df['var_len'], bins=bins, labels=False, include_lowest=True)\n", 127 | "\n", 128 | "# sample data points from each bin\n", 129 | "# if the number of data points in the bin is less than num_sample, duplication can occur\n", 130 | "# style_sampled_points = style_df.groupby(['style_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 131 | "# modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 132 | "var_len_sampled_points = var_len_df.groupby(['var_len_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 133 | "\n", 134 | "# style_sampled_points.index => (style_bin, code_index)\n", 135 | "# (deduplicated) index of sampled data points \n", 136 | "# style_index = list(set([e[1] for e in style_sampled_points.index]))\n", 137 | "# modularity_index = list(set([e[1] for e in modularity_sampled_points.index]))\n", 138 | "var_len_index = list(set([e[1] for e in var_len_sampled_points.index]))\n", 139 | "\n", 140 | "# the number of samples is less than expected\n", 141 | "# assert len(style_index) == num_bin * num_sample and len(modularity_index) == num_bin * num_sample\n", 142 | "assert len(var_len_index) == num_bin * num_sample\n", 143 | " \n", 144 | "selected_demonstration_by_style = [demonstration[i] for i in style_index]\n", 145 | "selected_demonstration_by_modularity = [demonstration[i] for i in modularity_index]\n", 146 | "selected_demonstration_by_var_len = [demonstration[i] for i in var_len_index]\n", 147 | "\n", 148 | "# save each demonstration which has high coverage of style or modularity\n", 149 | "# write_dict_to_jsonl(selected_demonstration_by_style, os.path.join(os.getcwd(), 'data', 'style_demonstration.jsonl'))\n", 150 | "# write_dict_to_jsonl(selected_demonstration_by_modularity, os.path.join(os.getcwd(), 'data', 'modularity_demonstration.jsonl'))\n", 151 | "write_dict_to_jsonl(selected_demonstration_by_var_len, os.path.join(os.getcwd(), 'data', 'var_len_demonstration.jsonl'))\n", 152 | "\n", 153 | "# for visualization\n", 154 | "# plt.scatter(style_sampled_points['style'], np.array([0.5] * len(style_sampled_points)), color='red', label='Sampled Data')\n", 155 | "# plt.scatter(modularity_sampled_points['modularity'], np.array([0.5] * len(modularity_sampled_points)), color='blue', label='Sampled Data')\n", 156 | "# plt.xlabel('Style')\n", 157 | "# plt.ylabel('Modularity (tmp)')\n", 158 | "# plt.legend()\n", 159 | "# plt.show() " 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "language_info": { 165 | "name": "python" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 2 170 | } 171 | 
-------------------------------------------------------------------------------- /apps/data/2shot_demonstration_101seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":979,"problem_description":"You are given a grid of size M x N, where each square is colored with some random color among K colors with each having equal probability.\n\nA Good Rectangle is defined as one where all squares lying on the inner border are of the same color.\n\nWhat is the expected number of Good Rectangles in the given grid.\n\n-----Input-----\n\n- \nFirst Line contains M, N, K\n\n-----Output-----\nA single value rounded off to the nearest Integer corresponding to the required answer.\n\n-----Constraints-----\n- 1 <= N <= 105 \n- 1 <= M <= 105 \n- 1 <= K <= 105 \n\n-----Example-----\nInput:\n1 3 1\nOutput:\n6","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. Please wrap your code answer using ```:","sc":"n, m, k = list(map(int, input().split()))\n\nif k == 1:\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\n print(ans)\nelse:\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * x\n ans += n * y\n ans -= n * m\n ans += 1e-9\n \n print(\"%.0f\" % ans)","sc_cc":10.0,"mc":"def for1(M,k):\n ret = 0.0\n x = k*k+0.0\n z=x\n for m in range(1,M):\n ret+=(M-m)\/x\n x*=z\n return ret \n \ndef for2(M,k):\n ret = 0.0\n x = k+0.0\n for m in range(1,M):\n ret+=(M-m)\/x\n \n x*=k\n return ret \n \ndef ans(M,N,K):\n\n return int(round(M*N+M*for2(N,K)+N*for2(M,K)+K*for1(M,K)*for1(N,K),0))\nM,N,K = list(map(int,input().split()))\nprint(ans(M,N,K))","mc_cc":1.5,"transformed_mc":["\ndef calculate_good_rectangles(n, m, k):\n if k == 1:\n return calculate_good_rectangles_k1(n, m)\n else:\n return calculate_good_rectangles_k(n, m, k)\n\ndef calculate_good_rectangles_k1(n, m):\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\n return ans\n\ndef calculate_good_rectangles_k(n, m, k):\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * 
x\n ans += n * y\n ans -= n * m\n ans += 1e-9\n return ans\n\ndef main():\n n, m, k = list(map(int, input().split()))\n ans = calculate_good_rectangles(n, m, k)\n print(\"%.0f\" % ans)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["n, m, k = list(map(int, input().split()))\nif k == 1:\n x, y = 0, 0\n for p in range(2, n + 1):\n x += (n - p + 1)\n for p in range(2, m + 1):\n y += (m - p + 1)\n ans = x * y\n x = 0\n for p in range(1, n + 1):\n x += (n - p + 1)\n y = 0\n for p in range(1, m + 1):\n y += (m - p + 1)\n ans += m * x\n ans += n * y\n ans -= n * m\nelse:\n x, y = 0.0, 0.0\n q = 1.0\n for p in range(2, n + 1):\n q \/= k * k\n x += (n - p + 1) * q\n for p in range(2, m + 1):\n q \/= k * k\n y += (m - p + 1) * q\n ans = k * x * y\n x = 0.0\n q = 1.0\n for p in range(1, n + 1):\n x += (n - p + 1) * q\n q \/= k\n y = 0.0\n q = 1.0\n for p in range(1, m + 1):\n y += (m - p + 1) * q\n q \/= k\n ans += m * x\n ans += n * y\n ans -= n * m\n ans += 1e-9\nprint(\"%.0f\" % ans)"]} 2 | {"problem_id":2109,"problem_description":"10^{10^{10}} participants, including Takahashi, competed in two programming contests.\nIn each contest, all participants had distinct ranks from first through 10^{10^{10}}-th.\nThe score of a participant is the product of his\/her ranks in the two contests.\nProcess the following Q queries:\n - In the i-th query, you are given two positive integers A_i and B_i. Assuming that Takahashi was ranked A_i-th in the first contest and B_i-th in the second contest, find the maximum possible number of participants whose scores are smaller than Takahashi's.\n\n-----Constraints-----\n - 1 \\leq Q \\leq 100\n - 1\\leq A_i,B_i\\leq 10^9(1\\leq i\\leq Q)\n - All values in input are integers.\n\n-----Input-----\nInput is given from Standard Input in the following format:\nQ\nA_1 B_1\n:\nA_Q B_Q\n\n-----Output-----\nFor each query, print the maximum possible number of participants whose scores are smaller than Takahashi's.\n\n-----Sample Input-----\n8\n1 4\n10 5\n3 3\n4 11\n8 9\n22 40\n8 36\n314159265 358979323\n\n-----Sample Output-----\n1\n12\n4\n11\n14\n57\n31\n671644785\n\nLet us denote a participant who was ranked x-th in the first contest and y-th in the second contest as (x,y).\nIn the first query, (2,1) is a possible candidate of a participant whose score is smaller than Takahashi's. There are never two or more participants whose scores are smaller than Takahashi's, so we should print 1.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"import math\nq=int(input())\nfor j in range(q):\n\tAB=[int(i) for i in input().split()]\n\tAB.sort()\n\tab=AB[0]*AB[1] \n\tfor i in range(int(math.sqrt(ab)),AB[1]+1):\n\t\tif i*i>=ab:\n\t\t\tx=i-1\n\t\t\tbreak\n\tif x==0:\n\t\tprint(0)\n\t\tcontinue\n\tfor i in range(int(ab\/x),ab+1):\n\t\tif x*i>=ab:\n\t\t\ty=i-1\n\t\t\tbreak\n\tcnt1=2*x\n\tcnts=1 if x==y else 0\n\tcntdd=1 if x-AB[0]>=0 else 0\n\tcntdu=1 if AB[1]-y>=0 and (AB[1]+1)*(x-AB[1]+y)>=ab else 0 \n\tprint(cnt1-cnts-cntdd-cntdu)","sc_cc":12.0,"mc":"import sys\n\ninput = sys.stdin.readline\nQ = int(input())\n\ndef max_score(x, a):\n ret = 0\n for p in [(x-4), (x-2), x, (x+2), x+4]:\n p += x%2\n p \/\/= 2\n ret = max(ret, (p+(p>=a))*(x-p+1))\n return ret\n\nfor _ in range(Q):\n a, b = list(map(int, input().split()))\n a, b = min(a, b), max(a, b)\n\n def is_ok(x):\n return a*b > max_score(x, a)\n\n def bisect(ng, ok):\n while (abs(ok - ng) > 1):\n mid = (ok + ng) \/\/ 2\n if is_ok(mid):\n ok = mid\n else:\n ng = mid\n return ok\n ans = bisect(2*b+1, a-1)\n print(ans)","mc_cc":2.0,"transformed_mc":["\nimport math\n\ndef calculate_max_participants(AB):\n AB.sort()\n ab = AB[0] * AB[1]\n \n x = calculate_x(ab, AB)\n if x == 0:\n return 0\n \n y = calculate_y(ab, x)\n \n cnt1 = 2 * x\n cnts = 1 if x == y else 0\n cntdd = 1 if x - AB[0] >= 0 else 0\n cntdu = 1 if AB[1] - y >= 0 and (AB[1] + 1) * (x - AB[1] + y) >= ab else 0\n \n return cnt1 - cnts - cntdd - cntdu\n\ndef calculate_x(ab, AB):\n for i in range(int(math.sqrt(ab)), AB[1] + 1):\n if i * i >= ab:\n return i - 1\n return 0\n\ndef calculate_y(ab, x):\n for i in range(int(ab \/ x), ab + 1):\n if x * i >= ab:\n return i - 1\n\ndef main():\n q = int(input())\n for _ in range(q):\n AB = [int(i) for i in input().split()]\n result = calculate_max_participants(AB)\n print(result)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import math\n\nq = int(input())\nfor _ in range(q):\n AB = [int(i) for i in input().split()]\n AB.sort()\n ab = AB[0] * AB[1]\n\n x = 0\n for i in range(int(math.sqrt(ab)), AB[1] + 1):\n if i * i >= ab:\n x = i - 1\n break\n if x == 0:\n result = 0\n continue\n\n for i in range(int(ab \/ x), ab + 1):\n if x * i >= ab:\n y = i - 1\n break\n \n cnt1 = 2 * x\n cnts = 1 if x == y else 0\n cntdd = 1 if x - AB[0] >= 0 else 0\n cntdu = 1 if AB[1] - y >= 0 and (AB[1] + 1) * (x - AB[1] + y) >= ab else 0\n result = cnt1 - cnts - cntdd - cntdu\n print(result)"]} 3 | -------------------------------------------------------------------------------- /codecontests/calculate_corr_between_mos_and_function_call.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculate code properties among 10% of original data and save" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "/data/kdy20401/.conda/envs/mc/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 20 | " from .autonotebook import tqdm as notebook_tqdm\n", 21 | "Map (num_proc=16): 100%|██████████| 126447/126447 [01:02<00:00, 2013.54 examples/s]\n", 22 | "Filter (num_proc=16): 100%|██████████| 126447/126447 [00:06<00:00, 19519.37 examples/s]\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "import os\n", 28 | "import random\n", 29 | "from datasets import Dataset\n", 30 | "from utils.utils import get_code_style_score, get_code_modularity_score, read_jsonl_to_dict, write_dict_to_jsonl\n", 31 | "\n", 32 | "\n", 33 | "def compute_code_score(example):\n", 34 | " code = example['code']\n", 35 | " try:\n", 36 | " score_modularity = get_code_modularity_score(code)\n", 37 | " except Exception:\n", 38 | " score_modularity = -1.0\n", 39 | "\n", 40 | " example['score_modularity'] = score_modularity\n", 41 | " return example\n", 42 | "\n", 43 | "\n", 44 | "def check_code_score(example):\n", 45 | " return example['score_modularity'] >= 0\n", 46 | "\n", 47 | "\n", 48 | "dataset = read_jsonl_to_dict(os.path.join(os.getcwd(), 'data', 'my_code_contests_train.jsonl'))\n", 49 | "demonstration = []\n", 50 | "\n", 51 | "# aggregate demonstration code\n", 52 | "# keys for dataset: dict_keys(['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'])\n", 53 | "# keys for solutions: dict_keys(['cc', 'modules', 'passed', 'solution'])\n", 54 | "for data in dataset:\n", 55 | " for i in range(len(data['solutions']['solution'])):\n", 56 | " if data['solutions']['passed'][i]:\n", 57 | " demonstration.append(\n", 58 | " {\n", 59 | " 'description': data['description'],\n", 60 | " 'code': data['solutions']['solution'][i],\n", 61 | " # more information?\n", 62 | " }\n", 63 | " )\n", 64 | "\n", 65 | "# calculate MoS\n", 66 | "random.seed(42)\n", 67 | "demonstration = random.sample(demonstration, len(demonstration) // 10) # 10% of total data\n", 68 | "demonstration = Dataset.from_list(demonstration)\n", 69 | "demonstration = demonstration.map(compute_code_score, num_proc=16)\n", 70 | "demonstration = demonstration.filter(check_code_score, num_proc=16)\n", 71 | "\n", 72 | "# save\n", 73 | "write_dict_to_jsonl(list(demonstration), os.path.join(os.getcwd(), 'data', 'demonstration_with_new_modularity.jsonl'))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Get 500 demonstrations" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 19, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "number of codes in demonstration pool: 125659\n" 93 | ] 94 | }, 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "/tmp/ipykernel_442372/1728575739.py:34: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", 100 | " modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "import os\n", 106 | "import random\n", 107 | "import numpy as np\n", 108 | "import pandas as pd\n", 109 | "import matplotlib.pyplot as plt\n", 110 | "from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score, get_code_modularity_score, get_average_length_of_variables\n", 111 | "\n", 112 | "\n", 113 | "random.seed(27) # for reproducibility\n", 114 | "num_sample = 10 # number of samples to be sampled from each bin\n", 115 | "\n", 116 | "# load demonstration pool\n", 117 | "file_name = 'demonstration_with_new_modularity'\n", 118 | "path = f'/data/kdy20401/Workspace/Proj-Code-Generation/MC/data/{file_name}.jsonl'\n", 119 | "demonstration = read_jsonl_to_dict(path)\n", 120 | "print(f'number of codes in demonstration pool: {len(demonstration)}')\n", 121 | "\n", 122 | "modularity = [] # score_modularity\n", 123 | "for data in demonstration:\n", 124 | " modularity.append(data['score_modularity'])\n", 125 | "\n", 126 | "modularity_df = pd.DataFrame({'modularity': np.array(modularity)})\n", 127 | "\n", 128 | "# bins: 0~0.1, 0.1~0.2, ..., 0.9~1.0\n", 129 | "num_bin = 10\n", 130 | "bins = np.linspace(0, 1, num_bin + 1)\n", 131 | "\n", 132 | "# find the grid cell to which each data point belongs\n", 133 | "# include_lowest=True makes 0 style or modularity value included in the first bin\n", 134 | "modularity_df['modularity_bin'] = pd.cut(modularity_df['modularity'], bins=bins, labels=False, include_lowest=True)\n", 135 | "\n", 136 | "# sample data points from each bin\n", 137 | "# if the number of data points in the bin is less than num_sample, duplication can occur\n", 138 | "modularity_sampled_points = modularity_df.groupby(['modularity_bin']).apply(lambda x: x.sample(num_sample, replace=True if len(x) < num_sample else False))\n", 139 | "\n", 140 | "# style_sampled_points.index => (style_bin, code_index)\n", 141 | "# (deduplicated) index of sampled data points \n", 142 | "modularity_index = list(set([e[1] for e in modularity_sampled_points.index]))\n", 143 | "\n", 144 | "# the number of samples is less than expected\n", 145 | "# assert len(style_index) == num_bin * num_sample and len(modularity_index) == num_bin * num_sample\n", 146 | "assert len(modularity_index) == num_bin * num_sample\n", 147 | " \n", 148 | "selected_demonstration_by_modularity = [demonstration[i] for i in modularity_index]\n", 149 | "\n", 150 | "# save each demonstration which has high coverage of style or modularity\n", 151 | "write_dict_to_jsonl(selected_demonstration_by_modularity, os.path.join(os.getcwd(), 'data', 'modularity_demonstration_with_new_modularity.jsonl'))\n", 152 | "\n", 153 | "# # for visualization\n", 154 | "# plt.scatter(modularity_sampled_points['modularity'], np.array([0.5] * len(modularity_sampled_points)), color='red', label='Sampled Data')\n", 155 | "# plt.xlabel('MoS')\n", 156 | "# plt.ylabel('temp')\n", 157 | "# plt.legend()\n", 158 | "# plt.show() " 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## calculate corr between mos and function calls" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 1, 171 | "metadata": {}, 172 | "outputs": 
[], 173 | "source": [ 174 | "from utils.utils import count_num_module_calls\n", 175 | "\n", 176 | "base_directory = os.getcwd()\n", 177 | "\n", 178 | "demonstration_dataset = read_jsonl_to_dict(\n", 179 | " \n", 180 | " os.path.join(\n", 181 | " base_directory,\n", 182 | " \"data\",\n", 183 | " 'modularity_demonstration_with_new_modularity.jsonl',\n", 184 | " ) \n", 185 | ")\n", 186 | "\n", 187 | "import matplotlib.pyplot as plt\n", 188 | "from scipy import stats\n", 189 | "\n", 190 | "mos, function_call = [], []\n", 191 | "for data in demonstration_dataset:\n", 192 | " mos.append(data['score_modularity'])\n", 193 | " function_call.append(count_num_module_calls(data['code']))\n", 194 | " \n", 195 | "pearsonr_stat = stats.pearsonr(mos, function_call)\n", 196 | "pearsonr, pearsonr_p = pearsonr_stat.correlation, pearsonr_stat.pvalue\n", 197 | "spearmanr_stat = stats.spearmanr(mos, function_call)\n", 198 | "spearmanr, spearmanr_p = spearmanr_stat.correlation, spearmanr_stat.pvalue\n", 199 | "\n", 200 | "plt.scatter(mos, function_call, color='red', label='Sampled Data')\n", 201 | "plt.xlabel('MoS')\n", 202 | "plt.ylabel('number of function calls')\n", 203 | "plt.legend()\n", 204 | "plt.show()\n", 205 | "\n", 206 | "print(f'pearsonr: {round(pearsonr, 2)}, pearsonr_p: {round(pearsonr_p, 2)}')\n", 207 | "print(f'spearmanr: {round(spearmanr, 2)}, spearmanr_p: {round(spearmanr_p, 2)}')" 208 | ] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "mc", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.9.19" 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 2 232 | } 233 | -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_134seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["Nauuo is a girl who loves writing comments.\n\nOne day, she posted a comment on Codeforces, wondering whether she would get upvotes or downvotes.\n\nIt's known that there were x persons who would upvote, y persons who would downvote, and there were also another z persons who would vote, but you don't know whether they would upvote or downvote. Note that each of the x+y+z people would vote exactly one time.\n\nThere are three different results: if there are more people upvote than downvote, the result will be \"+\"; if there are more people downvote than upvote, the result will be \"-\"; otherwise the result will be \"0\".\n\nBecause of the z unknown persons, the result may be uncertain (i.e. there are more than one possible results). 
More formally, the result is uncertain if and only if there exist two different situations of how the z persons vote, that the results are different in the two situations.\n\nTell Nauuo the result or report that the result is uncertain.\n\nInput\n\nThe only line contains three integers x, y, z (0\u2264 x,y,z\u2264100), corresponding to the number of persons who would upvote, downvote or unknown.\n\nOutput\n\nIf there is only one possible result, print the result : \"+\", \"-\" or \"0\".\n\nOtherwise, print \"?\" to report that the result is uncertain.\n\nExamples\n\nInput\n\n\n3 7 0\n\n\nOutput\n\n\n-\n\nInput\n\n\n2 0 1\n\n\nOutput\n\n\n+\n\nInput\n\n\n1 1 0\n\n\nOutput\n\n\n0\n\nInput\n\n\n0 0 1\n\n\nOutput\n\n\n?\n\nNote\n\nIn the first example, Nauuo would definitely get three upvotes and seven downvotes, so the only possible result is \"-\".\n\nIn the second example, no matter the person unknown downvotes or upvotes, Nauuo would get more upvotes than downvotes. So the only possible result is \"+\".\n\nIn the third example, Nauuo would definitely get one upvote and one downvote, so the only possible result is \"0\".\n\nIn the fourth example, if the only one person upvoted, the result would be \"+\", otherwise, the result would be \"-\". There are two possible results, so the result is uncertain.", "You have a sequence a with n elements 1, 2, 3, ..., k - 1, k, k - 1, k - 2, ..., k - (n - k) (k \u2264 n < 2k).\n\nLet's call as inversion in a a pair of indices i < j such that a[i] > a[j].\n\nSuppose, you have some permutation p of size k and you build a sequence b of size n in the following manner: b[i] = p[a[i]].\n\nYour goal is to find such permutation p that the total number of inversions in b doesn't exceed the total number of inversions in a, and b is lexicographically maximum.\n\nSmall reminder: the sequence of k integers is called a permutation if it contains all integers from 1 to k exactly once.\n\nAnother small reminder: a sequence s is lexicographically smaller than another sequence t, if either s is a prefix of t, or for the first i such that s_i \u2260 t_i, s_i < t_i holds (in the first position that these sequences are different, s has smaller number than t).\n\nInput\n\nThe first line contains a single integer t (1 \u2264 t \u2264 1000) \u2014 the number of test cases.\n\nThe first and only line of each test case contains two integers n and k (k \u2264 n < 2k; 1 \u2264 k \u2264 10^5) \u2014 the length of the sequence a and its maximum.\n\nIt's guaranteed that the total sum of k over test cases doesn't exceed 10^5.\n\nOutput\n\nFor each test case, print k integers \u2014 the permutation p which maximizes b lexicographically without increasing the total number of inversions.\n\nIt can be proven that p exists and is unique.\n\nExample\n\nInput\n\n\n4\n1 1\n2 2\n3 2\n4 3\n\n\nOutput\n\n\n1 \n1 2 \n2 1 \n1 3 2 \n\nNote\n\nIn the first test case, the sequence a = [1], there is only one permutation p = [1].\n\nIn the second test case, the sequence a = [1, 2]. There is no inversion in a, so there is only one permutation p = [1, 2] which doesn't increase the number of inversions.\n\nIn the third test case, a = [1, 2, 1] and has 1 inversion. If we use p = [2, 1], then b = [p[a[1]], p[a[2]], p[a[3]]] = [2, 1, 2] and also has 1 inversion.\n\nIn the fourth test case, a = [1, 2, 3, 2], and since p = [1, 3, 2] then b = [1, 3, 2, 3]. 
Both a and b have 1 inversion and b is the lexicographically maximum."], "public_tests": [{"input": ["3 7 0\n", "1 1 0\n", "0 0 1\n", "2 0 1\n"], "output": ["-", "0", "?", "+"]}, {"input": ["4\n1 1\n2 2\n3 2\n4 3\n"], "output": ["\n1 \n1 2 \n2 1 \n1 3 2 \n"]}], "private_tests": [{"input": ["100 0 100\n", "80 63 18\n", "25 12 100\n", "80 29 11\n", "10 5 6\n", "94 37 25\n", "98 82 13\n", "21 24 18\n", "1 2 2\n", "88 88 0\n", "73 29 43\n", "58 83 39\n", "97 33 19\n", "1 3 4\n", "100 100 0\n", "62 63 12\n", "99 20 7\n", "21 52 5\n", "43 9 61\n", "45 0 44\n", "7 4 4\n", "100 100 100\n", "34 51 3\n", "0 0 100\n", "3 3 2\n", "34 44 21\n", "87 98 19\n", "60 60 32\n", "22 99 77\n", "28 99 70\n", "33 24 13\n", "79 42 12\n", "48 100 48\n", "58 97 4\n", "52 14 10\n", "12 1 11\n", "5 2 10\n", "93 21 2\n", "8 5 5\n", "58 83 8\n", "97 64 6\n", "49 8 6\n", "13 6 8\n", "82 98 93\n", "7 4 3\n", "37 5 15\n", "100 0 99\n", "21 50 0\n", "0 100 48\n", "5 7 1\n", "42 40 4\n", "36 3 35\n", "8 87 7\n", "21 55 9\n", "0 0 0\n", "78 95 14\n", "0 100 99\n", "25 39 32\n", "89 41 36\n", "82 84 16\n", "25 35 23\n", "47 78 6\n", "42 43 16\n", "1 1 1\n", "1 0 1\n", "43 93 9\n", "3 4 5\n", "92 93 10\n", "0 87 13\n", "1 50 50\n", "100 0 48\n", "13 1 13\n", "19 90 4\n", "2 2 1\n", "98 44 17\n", "2 1 3\n", "2 82 17\n", "40 51 11\n", "83 3 8\n", "96 71 19\n", "62 56 5\n", "21 31 14\n", "50 100 50\n", "0 100 0\n", "96 55 0\n", "26 92 6\n", "6 5 4\n", "97 71 36\n", "74 2 16\n", "66 27 9\n", "47 40 10\n", "7 3 5\n", "3 2 3\n", "5 1 6\n", "86 1 0\n", "46 1 89\n", "5 3 3\n", "60 33 15\n", "4 3 1\n", "12 89 2\n", "5 5 3\n", "9 8 2\n", "100 48 48\n", "97 78 2\n", "1 2 7\n", "2 87 10\n", "15 4 15\n", "58 58 1\n", "5 3 2\n", "100 50 50\n"], "output": ["?", "?", "?", "+", "?", "+", "+", "?", "?", "0", "+", "?", "+", "?", "0", "?", "+", "-", "?", "+", "?", "?", "-", "?", "?", "?", "?", "?", "?", "-", "?", "+", "-", "-", "+", "?", "?", "+", "?", "-", "+", "+", "?", "?", "?", "+", "+", "-", "-", "-", "?", "?", "-", "-", "0", "-", "-", "?", "+", "?", "?", "-", "?", "?", "?", "-", "?", "?", "-", "?", "+", "?", "-", "?", "+", "?", "-", "?", "+", "+", "+", "?", "?", "-", "+", "-", "?", "?", "+", "+", "?", "?", "?", "?", "+", "?", "?", "+", "?", "-", "?", "?", "+", "+", "?", "-", "?", "?", "?", "?"]}, {"input": [], "output": []}], "transformed_sc": ["x, y, z = map(int, input().split())\nif (x+z) == y and (z+y) == x:\n print(\"0\")\nelif (x+z) >= y and (z+y) >= x:\n print(\"?\")\nelif x > y or (x+z) > y and (y+z) < x:\n print(\"+\")\nelif y > x or (y+z) > x and (x+z) < y:\n print(\"-\")", "test_cases = int(input())\nfor _ in range(test_cases):\n n, k = map(int, input().split())\n r = [y + 1 for y in range(k)]\n sequence = []\n h = 1\n t = 0\n for _ in range(n):\n if t == 0:\n sequence.append(h)\n h += 1\n if h > k:\n h = k - 1\n t = -1\n else:\n sequence.append(h)\n h -= 1\n if h <= 0:\n h = 1\n t = 0\n\n freq_map = {}\n for num in sequence:\n if num not in freq_map:\n freq_map[num] = 1\n else:\n freq_map[num] += 1\n\n p = max(freq_map.keys())\n for num, freq in freq_map.items():\n if freq > 1:\n p = num - 1\n break\n \n z = r[p:]\n z.sort(reverse=True)\n result = r[:p] + z\n\n print(*result)"], "sc": ["x,y,z=input().split()\nx=int(x)\ny=int(y)\nz=int(z)\nif (x+z)==y and (z+y)==x :\n print(\"0\")\nelif (x+z)>=y and (z+y)>=x:\n print(\"?\")\nelif x>y or (x+z)>y and (y+z)x or (y+z)>x and (x+z) c:\n h = c-1\n t = -1\n\n else:\n d.append(h)\n h -= 1\n\n if h <= 0:\n h = 1\n t = 0\n\n n = {}\n for y in d:\n if n.get(y) == None:\n n[y] = 1\n else:\n 
n[y] += 1\n\n p = c\n for y in n:\n if n[y] > 1:\n p = y-1\n break\n\n z = r[p:]\n z.sort(reverse=True)\n ss = (r[:p]+z)\n print(*ss)\n\n"], "sc_cc": [11.0, 11.0], "transformed_mc": ["\ndef determine_vote_result(x, y, z):\n if (x+z) == y and (z+y) == x:\n return \"0\"\n elif (x+z) >= y and (z+y) >= x:\n return \"?\"\n elif x > y or (x+z) > y and (y+z) < x:\n return \"+\"\n elif y > x or (y+z) > x and (x+z) < y:\n return \"-\"\n\ndef main():\n x, y, z = map(int, input().split())\n result = determine_vote_result(x, y, z)\n print(result)\n\nif __name__ == '__main__':\n main()\n", "def build_sequence(n, k):\n d = []\n h = 1\n t = 0\n for _ in range(n):\n if t == 0:\n d.append(h)\n h += 1\n if h > k:\n h = k - 1\n t = -1\n else:\n d.append(h)\n h -= 1\n if h <= 0:\n h = 1\n t = 0\n return d\n\ndef count_frequency(sequence):\n freq_map = {}\n for num in sequence:\n if num not in freq_map:\n freq_map[num] = 1\n else:\n freq_map[num] += 1\n return freq_map\n\ndef find_p_value(freq_map):\n p = max(freq_map.keys())\n for num, freq in freq_map.items():\n if freq > 1:\n p = num - 1\n break\n return p\n\ndef sort_and_combine(r, p, k):\n z = r[p:]\n z.sort(reverse=True)\n result = r[:p] + z\n return result\n\ndef find_permutation(n, k):\n r = [y + 1 for y in range(k)]\n sequence = build_sequence(n, k)\n freq_map = count_frequency(sequence)\n p = find_p_value(freq_map)\n result = sort_and_combine(r, p, k)\n return result\n\ndef main():\n test_cases = int(input())\n for _ in range(test_cases):\n n, k = map(int, input().split())\n result = find_permutation(n, k)\n print(*result)\n\nif __name__ == '__main__':\n main()"]} 2 | -------------------------------------------------------------------------------- /apps/icl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | import numpy as np 7 | from collections import defaultdict 8 | 9 | import torch 10 | 11 | from datasets import load_dataset, Dataset 12 | 13 | from vllm import LLM, SamplingParams 14 | 15 | from utils import read_jsonl_to_dict, write_dict_to_jsonl, get_avg_cc 16 | 17 | import sys 18 | 19 | 20 | def set_seed(seed): 21 | random.seed(seed) 22 | np.random.seed(seed) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed(seed) 25 | # When running on the CuDNN backend, two further options must be set 26 | torch.backends.cudnn.deterministic = True 27 | torch.backends.cudnn.benchmark = False 28 | # Set a fixed value for the hash seed 29 | os.environ["PYTHONHASHSEED"] = str(seed) 30 | 31 | 32 | def get_transformed_demonstration(args, data): 33 | demonstration = defaultdict(list) 34 | 35 | for i in range(args.num_icl_shot): 36 | if "sc" in args.code_type: 37 | instruction = data["sc_instruction"][i] 38 | else: 39 | instruction = data["mc_instruction"][i] 40 | 41 | if "transformed" in args.code_type: 42 | code = data[args.code_type][i][0].strip() 43 | else: 44 | code = data[args.code_type][i].strip() 45 | 46 | demonstration["problem_id"].append(data["problem_id"][i]) 47 | demonstration["description"].append(data["problem_description"][i].strip()) 48 | demonstration["instruction"].append(instruction) 49 | demonstration["starter_code"].append(data["starter_code"][i]) 50 | demonstration["code"].append(code) 51 | demonstration["code_cc"].append(get_avg_cc(data[args.code_type][i])) 52 | 53 | return demonstration 54 | 55 | 56 | def extract_solution(args, generation): 57 | if args.num_icl_shot > 0: 58 | start_index = generation.find("```") 59 | 
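        # the prompt asks the model to wrap its answer in triple backticks; the logic below takes the first complete fenced span and falls back to an empty string when the fence is missing or unterminated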
        if start_index == -1:
60 |             solution = ""
61 |         else:
62 |             end_index = generation.find("```", start_index + len("```"))
63 |             if start_index < end_index:
64 |                 solution = generation[start_index + len("```") : end_index]
65 |             else:
66 |                 solution = ""
67 |     else:
68 |         solution = ""  # zero-shot prompting is not implemented yet
69 |     return solution
70 | 
71 | def make_prompt(args, demonstration, test_data):
72 |     if test_data["starter_code"] == "":
73 |         question_guide = "read from and write to standard IO"
74 |     else:
75 |         question_guide = "use the provided function signature"
76 | 
77 |     if "sc" in args.code_type:
78 |         instruction = (
79 |             "Write a python code to solve the following coding problem "
80 |             "that obeys the constraints and passes the example test cases. "
81 |             f"The output code needs to {question_guide}. "
82 |             "Please wrap your code answer using ```:"
83 |         )
84 |     elif "mc" in args.code_type:
85 |         instruction = (
86 |             "Write a python code to solve the following coding problem "
87 |             "that obeys the constraints and passes the example test cases. "
88 |             f"The output code needs to {question_guide}. "
89 |             "Ensure modularity of the python code by dividing the code into smaller, "
90 |             "useful functions to solve the given problem. "
91 |             "Please wrap your code answer using ```:"
92 |         )
93 | 
94 |     # instruction of CodeLlama for APPS
95 |     if "meta-llama/CodeLlama" in args.model:
96 |         # make zero-shot or few-shot prompt
97 |         prompt = ""
98 |         if args.num_icl_shot == 0:
99 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
100 |         elif args.num_icl_shot > 0:
101 |             for i in range(args.num_icl_shot):
102 |                 prompt += "Q: " + demonstration["instruction"][i] + "\n"
103 |                 prompt += demonstration["description"][i] + "\n"
104 |                 prompt += demonstration["starter_code"][i] + "\n"
105 |                 prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n"
106 |             prompt += "Q: " + instruction + "\n"
107 |             prompt += test_data["question"] + "\n"
108 |             prompt += test_data["starter_code"] + "\n"
109 |             prompt += "A: "
110 | 
111 |     # instruction of DeepseekCoder for APPS
112 |     elif "deepseek-ai/deepseek-coder" in args.model:
113 |         # make zero-shot or few-shot prompt
114 |         prompt = ""
115 |         if args.num_icl_shot == 0:
116 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
117 |         elif args.num_icl_shot > 0:
118 |             for i in range(args.num_icl_shot):
119 |                 prompt += demonstration["instruction"][i] + "\n"
120 |                 prompt += "### Instruction:\n" + demonstration["description"][i] + "\n"
121 |                 prompt += demonstration["starter_code"][i] + "\n"
122 |                 prompt += (
123 |                     "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n"
124 |                 )
125 |             prompt += instruction + "\n"
126 |             prompt += "### Instruction:\n" + test_data["question"] + "\n"
127 |             prompt += test_data["starter_code"] + "\n"
128 |             prompt += "### Response:\n"
129 | 
130 |     return prompt
131 | 
132 | 
133 | def main():
134 |     parser = argparse.ArgumentParser()
135 |     parser.add_argument("--seed", type=int, default=42)
136 |     parser.add_argument("--model", type=str, default="meta-llama/CodeLlama-7b-hf")
137 |     parser.add_argument("--num_gpu", type=int, default=1)
138 |     parser.add_argument("--dtype", type=str, default="float16")
139 |     parser.add_argument("--num_icl_shot", type=int, default=2)
140 |     parser.add_argument(
141 |         "--num_gen",
142 |         type=int,
143 |         default=1,
144 |         help="number of solutions generated per problem",
145 |     )
146 |     parser.add_argument("--code_type", type=str, default="sc")
147 |     parser.add_argument(
148 |         "--temperature",
149 |         type=float,
150 |         default=0,
151 |         help="0 means greedy decoding for vllm",
152 |     )
153 |     parser.add_argument("--max_new_token", type=int, default=1024)
154 |     parser.add_argument("--top_p", type=float, default=0.95)
155 |     parser.add_argument(
156 |         "--modify",
157 |         type=str,
158 |         default="original",
159 |         help="modification method of the demonstration code",
160 |     )
161 |     parser.add_argument(
162 |         "--swap_space",
163 |         type=int,
164 |         default=4,
165 |         help="The size (GiB) of CPU memory per GPU to use as swap space",
166 |     )
167 | 
168 |     args = parser.parse_args()
169 | 
170 |     set_seed(args.seed)
171 | 
172 |     base_directory = os.path.dirname(__file__)
173 |     if not os.path.exists(os.path.join(base_directory, "result")):
174 |         os.makedirs(os.path.join(base_directory, "result"))
175 |     file_name = f"{args.model.replace('/', '-')}_{args.code_type}_{args.modify}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl"
176 | 
177 |     data = Dataset.from_json(
178 |         os.path.join(
179 |             os.path.dirname(__file__),
180 |             "data",
181 |             f"2shot_demonstration_{args.seed}seed_reduced2.json",
182 |         )
183 |     )
184 | 
185 |     demonstration = get_transformed_demonstration(args, data)
186 | 
187 |     # load apps test dataset
188 |     test_dataset = load_dataset("codeparrot/apps", split="test", trust_remote_code=True)
189 |     # filtering for specific platforms
190 |     words = ["codeforces", "atcoder", "codechef"]
191 |     test_dataset = test_dataset.filter(
192 |         lambda x: any(word in x["url"] for word in words)
193 |     )
194 | 
195 |     prompts = []
196 |     for test_data in test_dataset:
197 |         prompt = make_prompt(args, demonstration, test_data)
198 |         prompts.append(prompt)
199 |     start_index = 0  # offset of the first unsolved problem (nonzero when resuming)
200 |     if os.path.exists(os.path.join(base_directory, "result", file_name)):
201 |         results = read_jsonl_to_dict(os.path.join(base_directory, "result", file_name))
202 |         start_index = len(results)
203 |         if start_index != len(prompts):
204 |             prompts = prompts[start_index:]
205 |         else:
206 |             print("All problems are already solved.")
207 |             sys.exit()
208 | 
209 |     # load model
210 |     # when initializing VLLM engine, random.seed() is called internally.
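    # (note: codecontests/icl_corr.py adds that set_seed() should be called after the engine is initialized, since vLLM may reseed Python's RNG; here set_seed(args.seed) ran earlier, so re-calling it after the LLM(...) call below is a reasonable safeguard if exact sampling reproducibility matters)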
211 |     model = LLM(
212 |         model=args.model,
213 |         tensor_parallel_size=args.num_gpu,
214 |         dtype=args.dtype,
215 |         max_model_len=8192,
216 |         swap_space=args.swap_space,
217 |     )
218 |     if "meta-llama/CodeLlama" in args.model:
219 |         stop = ["Q:", "A:"]
220 |     elif "deepseek-ai/deepseek-coder" in args.model:
221 |         stop = ["### Instruction", "### Response"]
222 | 
223 |     sampling_params = SamplingParams(
224 |         n=args.num_gen,
225 |         temperature=args.temperature,
226 |         top_p=args.top_p,
227 |         max_tokens=args.max_new_token,
228 |         stop=stop,
229 |     )
230 | 
231 |     # inference using vllm
232 |     generations = []
233 |     solutions = []
234 |     for idx, prompt in enumerate(tqdm(prompts)):
235 |         outputs = model.generate(
236 |             prompt, sampling_params=sampling_params, use_tqdm=False
237 |         )
238 | 
239 |         for output in outputs:
240 |             # for each input in the prompts, args.gen_num number of outputs are generated
241 |             generations_ = [outs.text for outs in output.outputs]
242 |             assert len(generations_) == args.num_gen
243 |             # extract solution code from generated code
244 |             solutions_ = [
245 |                 extract_solution(args, generation) for generation in generations_
246 |             ]
247 |             # save generated solutions (list)
248 |             generations.append(generations_)
249 |             solutions.append(solutions_)
250 | 
251 |         # save generated solutions
252 |         result = []
253 | 
254 |         result.append(
255 |             {
256 |                 "problem_id": test_dataset[start_index + idx]["problem_id"],
257 |                 "description": test_dataset[start_index + idx]["question"],
258 |                 "difficulty": test_dataset[start_index + idx]["difficulty"],
259 |                 "starter_code": test_dataset[start_index + idx]["starter_code"],
260 |                 "generated_solutions": generations_,
261 |                 "extracted_solutions": solutions_,
262 |                 "prompt": prompt,
263 |                 "demonstration": demonstration,
264 |             }
265 |         )
266 | 
267 |         write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name))
268 | 
269 |     print("program ends.")
270 | 
271 | 
272 | if __name__ == "__main__":
273 |     main()
274 | 
--------------------------------------------------------------------------------
/codecontests/sc2mc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | 
4 | from utils.utils_evaluate import verify_code_official
5 | 
6 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl
7 | 
8 | from openai import OpenAI
9 | 
10 | import multiprocessing
11 | 
12 | 
13 | problem_description = '''\
14 | QUESTION:
15 | Given a permutation $p$ of length $n$, find its subsequence $s_1$, $s_2$, $\ldots$, $s_k$ of length at least $2$ such that: $|s_1-s_2|+|s_2-s_3|+\ldots+|s_{{k-1}}-s_k|$ is as big as possible over all subsequences of $p$ with length at least $2$. Among all such subsequences, choose the one whose length, $k$, is as small as possible.
16 | 
17 | If multiple subsequences satisfy these conditions, you are allowed to find any of them.
18 | 
19 | A sequence $a$ is a subsequence of an array $b$ if $a$ can be obtained from $b$ by deleting some (possibly, zero or all) elements.
20 | 
21 | A permutation of length $n$ is an array of length $n$ in which every element from $1$ to $n$ occurs exactly once.
22 | 
23 | 
24 | -----Input-----
25 | 
26 | The first line contains an integer $t$ ($1 \le t \le 2 \cdot 10^4$) — the number of test cases. The description of the test cases follows.
27 | 
28 | The first line of each test case contains an integer $n$ ($2 \le n \le 10^5$) — the length of the permutation $p$.
29 | 30 | The second line of each test case contains $n$ integers $p_1$, $p_2$, $\ldots$, $p_{{n}}$ ($1 \le p_i \le n$, $p_i$ are distinct) — the elements of the permutation $p$. 31 | 32 | The sum of $n$ across the test cases doesn't exceed $10^5$. 33 | 34 | 35 | -----Output----- 36 | 37 | For each test case, the first line should contain the length of the found subsequence, $k$. The second line should contain $s_1$, $s_2$, $\ldots$, $s_k$ — its elements. 38 | 39 | If multiple subsequences satisfy these conditions, you are allowed to find any of them. 40 | 41 | 42 | -----Example----- 43 | Input 44 | 2 45 | 3 46 | 3 2 1 47 | 4 48 | 1 3 4 2 49 | 50 | Output 51 | 2 52 | 3 1 53 | 3 54 | 1 4 2 55 | 56 | 57 | 58 | -----Note----- 59 | 60 | In the first test case, there are $4$ subsequences of length at least $2$: $[3,2]$ which gives us $|3-2|=1$. $[3,1]$ which gives us $|3-1|=2$. $[2,1]$ which gives us $|2-1|=1$. $[3,2,1]$ which gives us $|3-2|+|2-1|=2$. 61 | 62 | So the answer is either $[3,1]$ or $[3,2,1]$. Since we want the subsequence to be as short as possible, the answer is $[3,1]$.\ 63 | ''' 64 | 65 | sc = '''\ 66 | ANSWER: 67 | ```python 68 | import sys 69 | for _ in range(int(input())): 70 | n = int(input()) 71 | data = list(map(int, input().split())) 72 | ans = [data[0]] 73 | for i in range(1, n - 1): 74 | if data[i - 1] < data[i] > data[i + 1] or data[i - 1] > data[i] < data[i + 1]: 75 | ans += [data[i]] 76 | print(len(ans) + 1) 77 | print(*ans, data[-1]) 78 | ```\ 79 | ''' 80 | 81 | mc = '''\ 82 | ```python 83 | import sys 84 | 85 | def ii(): 86 | return sys.stdin.readline().strip() 87 | 88 | def idata(): 89 | return [int(x) for x in ii().split()] 90 | 91 | def solve_of_problem(): 92 | n = int(ii()) 93 | data = idata() 94 | ans = [data[0]] 95 | for i in range(1, n - 1): 96 | if data[i - 1] < data[i] > data[i + 1] or data[i - 1] > data[i] < data[i + 1]: 97 | ans += [data[i]] 98 | print(len(ans) + 1) 99 | print(*ans, data[-1]) 100 | return 101 | 102 | if __name__ == '__main__': 103 | for ______ in range(int(ii())): 104 | solve_of_problem() 105 | ```\ 106 | ''' 107 | 108 | sc2mc_instruction = '''\ 109 | Refactor the above python program following the question. Follow the guidelines 110 | * make the program more modular with smaller and meaningful helper functions 111 | * good descriptive names for the helper functions 112 | * have an entry function called ‘main()’ 113 | * 'main()' is called inside 'if __name__ == '__main__'' 114 | 115 | Do not change the original semantics of the program significantly and no need to perform optimizations. \ 116 | Enclose the program within backticks as shown above\ 117 | ''' 118 | 119 | mc2sc_instruction = '''\ 120 | Refactor the above program. Follow the guidelines 121 | * make the program monolithic without helper functions 122 | * transform the program with multiple functions into a single piece of code 123 | * do not copy the given code exactly as it is 124 | * eliminate any modular structures such as separate functions or classes, merging them into a continuous, unified script 125 | 126 | Do not change the original semantics of the program significantly and no need to perform optimizations. 
\
127 | Enclose the program within backticks as shown above\
128 | '''
129 | 
130 | sc2mc_demonstration = {
131 |     'problem_description': problem_description,
132 |     'sc': sc,
133 |     'mc': mc,
134 |     'instruction': sc2mc_instruction
135 | }
136 | 
137 | mc2sc_demonstration = {
138 |     'problem_description': problem_description,
139 |     'sc': sc,
140 |     'mc': mc,
141 |     'instruction': mc2sc_instruction
142 | }
143 | 
144 | def make_gpt_chat_message(role, content):
145 |     return {'role': role, 'content': content}
146 | 
147 | 
148 | def make_sc2mc_prompt(demonstration, input_data, shot):
149 |     problem_description = demonstration['problem_description']
150 |     sc = demonstration['sc']
151 |     mc = demonstration['mc']
152 |     instruction = demonstration['instruction']
153 | 
154 |     input_problem_description = input_data['problem_description']
155 |     input_code = input_data['code']
156 | 
157 |     messages = []
158 | 
159 |     # zero-shot prompt for sc -> mc
160 |     if shot == 0:
161 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
162 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
163 |     # 1-shot prompt for sc -> mc
164 |     else:
165 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
166 |         messages.append(make_gpt_chat_message('user', problem_description + '\n' + sc + '\n' + instruction))
167 |         messages.append(make_gpt_chat_message('assistant', mc))
168 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
169 | 
170 |     return messages
171 | 
172 | 
173 | def make_mc2sc_prompt(demonstration, input_data, shot):
174 |     problem_description = demonstration['problem_description']
175 |     sc = demonstration['sc']
176 |     mc = demonstration['mc']
177 |     instruction = demonstration['instruction']
178 | 
179 |     input_problem_description = input_data['problem_description']
180 |     input_code = input_data['code']
181 | 
182 |     messages = []
183 | 
184 |     # zero-shot prompt for mc -> sc
185 |     if shot == 0:
186 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
187 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
188 |     # 1-shot prompt for mc -> sc
189 |     else:
190 |         messages.append(make_gpt_chat_message('system', "You are an AI programming assistant."))
191 |         messages.append(make_gpt_chat_message('user', problem_description + '\n' + mc + '\n' + instruction))
192 |         messages.append(make_gpt_chat_message('assistant', sc))
193 |         messages.append(make_gpt_chat_message('user', 'QUESTION:\n' + input_problem_description + '\n' + 'ANSWER:\n```python\n' + input_code + '\n```\n' + instruction))
194 | 
195 |     return messages
196 | 
197 | 
198 | def check_correctness(code, tests):
199 |     GLOBAL_TIMEOUT = 10
200 | 
201 |     def _temp_run(code, tests, result):
202 |         try:
203 |             flag, outcomes = verify_code_official(tests, code)
204 |             result.append(flag)
205 |         except Exception:
206 |             pass
207 | 
208 |     manager = multiprocessing.Manager()
209 |     result = manager.list()
210 |     p = multiprocessing.Process(target=_temp_run, args=(code, tests, result))
211 |     p.start()
212 |     p.join(timeout=GLOBAL_TIMEOUT + 1)
213 |     if p.is_alive():
214 |         p.kill()
215 |     if not result:
216 |         result = [-1]
217 |     if result[0] == True:
218 |         return True
219 |     else:
220 |         return False
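# check_correctness runs the candidate program in a separate process so that hanging code is bounded by GLOBAL_TIMEOUT;
# the manager list is how the child process reports its pass/fail flag back to the parent.
# usage sketch (hypothetical test data): check_correctness("print(1)", {'inputs': [''], 'outputs': ['1\n']})
# should return True only when verify_code_official reports that every test passed.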
221 | 
222 | def main():
223 |     # seeds = [27, 42, 101, 134, 169]
224 |     # seeds = [42, 101, 134, 169]
225 |     seeds = [101]
226 |     code_type = 'monolithic'
227 |     client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
228 | 
229 |     for seed in seeds:
230 |         base_directory = os.getcwd()
231 |         file_name = f"{code_type}_2shot_demonstration_{seed}seed.jsonl"
232 |         data = read_jsonl_to_dict(os.path.join(base_directory, 'data', file_name))[0]
233 | 
234 |         transformed_code = []
235 |         passed = []
236 |         # 2 examples are in demonstration
237 |         for i in range(2):
238 |             problem_description = data['problem_description'][i]
239 |             input_code = data['code'][i]
240 |             input_data = {'problem_description': problem_description, 'code': input_code}
241 |             messages = make_sc2mc_prompt(sc2mc_demonstration, input_data, shot=1)
242 | 
243 |             completion = client.chat.completions.create(
244 |                 model="gpt-3.5-turbo",
245 |                 messages=messages,
246 |                 max_tokens=1024,
247 |                 stop=["\n\n\n\n", "####", "----"],
248 |                 temperature=0,
249 |             )
250 |             response = completion.choices[0].message.content
251 | 
252 |             start_index = response.find('```python')
253 |             if start_index != -1:
254 |                 end_index = response.find('```', start_index + len('```python'))
255 |                 if end_index != -1:
256 |                     response = response[start_index + len('```python'): end_index]
257 |                 else:
258 |                     response = response[start_index + len('```python'):]
259 |             transformed_code.append(response)
260 | 
261 |             ## correctness check
262 |             tests = {'inputs': [], 'outputs': []}
263 |             tests['inputs'].extend(data['public_tests'][i]['input'])
264 |             tests['inputs'].extend(data['private_tests'][i]['input'])
265 |             tests['outputs'].extend(data['public_tests'][i]['output'])
266 |             tests['outputs'].extend(data['private_tests'][i]['output'])
267 | 
268 |             if check_correctness(response, tests):
269 |                 print('pass')
270 |                 passed.append(True)
271 |             else:
272 |                 print('not passed')
273 |                 passed.append(False)
274 | 
275 |         data['transformed_code'] = transformed_code
276 |         data['passed'] = passed
277 |         write_dict_to_jsonl([data], os.path.join(base_directory, 'data', file_name))
278 | 
279 | if __name__ == '__main__':
280 |     main()
--------------------------------------------------------------------------------
/codecontests/icl_corr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import argparse
4 | from tqdm import tqdm
5 | import numpy as np
6 | from collections import defaultdict
7 | import torch
8 | from datasets import load_dataset
9 | from vllm import LLM, SamplingParams
10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score
11 | 
12 | 
13 | def set_seed(seed):
14 |     random.seed(seed)
15 |     np.random.seed(seed)
16 |     torch.manual_seed(seed)
17 |     torch.cuda.manual_seed(seed)
18 |     # When running on the CuDNN backend, two further options must be set
19 |     torch.backends.cudnn.deterministic = True
20 |     torch.backends.cudnn.benchmark = False
21 |     # Set a fixed value for the hash seed
22 |     os.environ["PYTHONHASHSEED"] = str(seed)
23 | 
24 | 
25 | def extract_solution(args, generation):
26 |     if "meta-llama/CodeLlama" in args.model:
27 |         if args.num_icl_shot == 0:
28 |             raise NotImplementedError  # zero-shot prompting is not implemented yet
29 |         elif args.num_icl_shot > 0:
30 |             start_index = generation.find("```")
31 |             if start_index == -1:
32 |                 solution = ""
33 |             else:
34 |                 end_index = generation.find("```", start_index + len("```"))
35 |                 if start_index < end_index:
36 |                     solution = generation[start_index + len("```") : end_index]
37 |                 else:
38 |                     solution = ""
39 | 
40 |     elif "deepseek-ai/deepseek-coder" in args.model:
"deepseek-ai/deepseek-coder" in args.model: 41 | if args.num_icl_shot == 0: 42 | assert () # not implemented yet 43 | elif args.num_icl_shot > 0: 44 | start_index = generation.find("```") 45 | if start_index == -1: 46 | solution = "" 47 | else: 48 | end_index = generation.find("```", start_index + len("```")) 49 | if start_index < end_index: 50 | solution = generation[start_index + len("```") : end_index] 51 | else: 52 | solution = "" 53 | 54 | return solution 55 | 56 | 57 | def make_prompt(args, demonstration, test_data): 58 | instruction = ( 59 | "Write a python code to solve the following coding problem " 60 | "that obeys the constraints and passes the example test cases. " 61 | "The output code needs to read from and write to standard IO. " 62 | "Please wrap your code answer using ```:" 63 | ) 64 | 65 | if "meta-llama/CodeLlama" in args.model: 66 | # make zero-shot or few-shot prompt 67 | prompt = "" 68 | if args.num_icl_shot == 0: 69 | assert () # not implemented yet 70 | elif args.num_icl_shot > 0: 71 | for i in range(args.num_icl_shot): 72 | prompt += "Q: " + instruction + "\n" 73 | prompt += demonstration["description"][i] + "\n" 74 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 75 | prompt += "Q: " + instruction + "\n" 76 | prompt += test_data["description"] + "\n" 77 | prompt += "A: " 78 | elif "deepseek-ai/deepseek-coder" in args.model: 79 | # make zero-shot or few-shot prompt 80 | prompt = "" 81 | if args.num_icl_shot == 0: 82 | assert () # not implemented yet 83 | elif args.num_icl_shot > 0: 84 | prompt += instruction + "\n" 85 | for i in range(args.num_icl_shot): 86 | prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 87 | prompt += ( 88 | "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 89 | ) 90 | prompt += "### Instruction:\n" + test_data["description"] + "\n" 91 | prompt += "### Response:\n" 92 | 93 | return prompt 94 | 95 | 96 | def main(): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument("--seed", type=int, required=True, default=42) 99 | parser.add_argument( 100 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 101 | ) 102 | parser.add_argument("--num_gpu", type=int, required=True, default=1, help="total number of gpus used") 103 | parser.add_argument("--dtype", type=str, required=True, default="float16") 104 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 105 | parser.add_argument( 106 | "--num_gen", 107 | type=int, 108 | required=True, 109 | default=1, 110 | help="number of solutions generated per problem", 111 | ) 112 | parser.add_argument( 113 | "--temperature", 114 | type=float, 115 | required=True, 116 | default=0, 117 | help="0 means greedy decoding for vllm", 118 | ) 119 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 120 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 121 | parser.add_argument( 122 | "--swap_space", 123 | type=int, 124 | required=False, 125 | default=4, 126 | help="The size (GiB) of CPU memory per GPU to use as swap space", 127 | ) 128 | parser.add_argument( 129 | "--metric", 130 | type=str, 131 | required=True, 132 | default='style', 133 | help="code metric (e.g., style or modularity)", 134 | ) 135 | # additional arguments candidiates: 136 | # max_model_len 137 | # stop 138 | # start_token, end_token 139 | args = parser.parse_args() 140 | 141 | # this code is impelemented for only 1-shot ICL 142 | assert args.num_icl_shot == 1 143 | 144 | # load model 145 | 
# when initializing VLLM engine, random.seed() is called internally. 146 | # so, set_seed() should be called after initializing VLLM engine. 147 | model = LLM( 148 | model=args.model, 149 | tensor_parallel_size=args.num_gpu, 150 | dtype=args.dtype, 151 | max_model_len=8192, 152 | swap_space=args.swap_space, 153 | ) 154 | 155 | if "meta-llama/CodeLlama" in args.model: 156 | stop = ["Q:", "A:"] 157 | elif "deepseek-ai/deepseek-coder" in args.model: 158 | stop = ["### Instruction", "### Response"] 159 | 160 | sampling_params = SamplingParams( 161 | n=args.num_gen, 162 | temperature=args.temperature, 163 | top_p=args.top_p, 164 | max_tokens=args.max_new_token, 165 | stop=stop, 166 | ) 167 | 168 | # load code contest test dataset 169 | test_dataset = load_dataset( 170 | "deepmind/code_contests", 171 | split="test", 172 | cache_dir="/data/huggingface/datasets", 173 | ) 174 | 175 | # set seed 176 | set_seed(args.seed) 177 | 178 | base_directory = os.path.dirname(__file__) 179 | 180 | # demonstration pool constructed by style or modularity 181 | demonstration_dataset = read_jsonl_to_dict( 182 | os.path.join( 183 | base_directory, 184 | "data", 185 | f"{args.metric}_demonstration.jsonl", 186 | ) 187 | ) 188 | assert len(demonstration_dataset) == 100 189 | 190 | # iterate over codes in the demonstration 191 | # make 1-shot prompt using the code and estimate pass@k 192 | for code_idx, data in enumerate(demonstration_dataset): 193 | if data['var_len'] < 5: 194 | continue 195 | print(f'average variable length: {data["var_len"]}') 196 | file_name = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.metric}_{code_idx}code_icl_result.jsonl" 197 | if not os.path.exists(os.path.join(base_directory, "result", file_name)): 198 | print(file_name) 199 | description = data['description'] 200 | code = data['code'] 201 | score_style = data['score_style'] # 'score_pep8', 'score_var', 'score_style' 202 | score_modularity = data['score_modularity'] 203 | 204 | # make demonstration for each code (1-shot) 205 | demonstration = defaultdict(list) 206 | demonstration['description'].append(description.strip()) 207 | demonstration["code"].append(code.strip()) 208 | demonstration['score_style'].append(score_style) 209 | demonstration['score_modularity'].append(score_modularity) 210 | demonstration['var_len'].append(data['var_len']) 211 | 212 | # make prompt for each test data 213 | prompts = [] 214 | # test_dataset = list(test_dataset)[:1] # for test 215 | for test_data in test_dataset: 216 | prompt = make_prompt(args, demonstration, test_data) 217 | prompts.append(prompt) 218 | 219 | # inference using vllm 220 | generations = [] 221 | solutions = [] 222 | # generate solution code using vllm 223 | print(f'') 224 | 225 | outputs = model.generate( 226 | prompts, sampling_params=sampling_params, use_tqdm=True 227 | ) 228 | for output in outputs: 229 | # for each input in the prompts, args.gen_num number of outputs are generated 230 | generations_ = [outs.text for outs in output.outputs] 231 | assert len(generations_) == args.num_gen 232 | # extract solution code from generated code 233 | solutions_ = [ 234 | extract_solution(args, generation) for generation in generations_ 235 | ] 236 | # save generated solutions (list) 237 | generations.append(generations_) 238 | solutions.append(solutions_) 239 | 240 | # save generated solutions 241 | result = [] 242 | for i, test_data in enumerate(test_dataset): 243 | result.append( 244 | { 245 | "name": test_data["name"], 246 | 
"description": test_data["description"], 247 | "public_tests": test_data["public_tests"], 248 | "private_tests": test_data["private_tests"], 249 | "difficulty": test_data["difficulty"], 250 | "cf_rating": test_data["cf_rating"], # difficulty level 251 | "generated_solutions": generations[i], # list of generated solutions 252 | "extracted_solutions": solutions[i], 253 | "prompt": prompts[i], 254 | "demonstration": demonstration, # contains code and its metric scores 255 | } 256 | ) 257 | 258 | write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name)) 259 | 260 | print(f'program ends.') 261 | 262 | 263 | if __name__ == "__main__": 264 | main() 265 | -------------------------------------------------------------------------------- /codecontests/data/monolithic_2shot_demonstration_42seed.jsonl: -------------------------------------------------------------------------------- 1 | {"problem_description": ["Pasha loves to send strictly positive integers to his friends. Pasha cares about security, therefore when he wants to send an integer n, he encrypts it in the following way: he picks three integers a, b and c such that l \u2264 a,b,c \u2264 r, and then he computes the encrypted value m = n \u22c5 a + b - c.\n\nUnfortunately, an adversary intercepted the values l, r and m. Is it possible to recover the original values of a, b and c from this information? More formally, you are asked to find any values of a, b and c such that\n\n * a, b and c are integers, \n * l \u2264 a, b, c \u2264 r, \n * there exists a strictly positive integer n, such that n \u22c5 a + b - c = m. \n\nInput\n\nThe first line contains the only integer t (1 \u2264 t \u2264 20) \u2014 the number of test cases. The following t lines describe one test case each.\n\nEach test case consists of three integers l, r and m (1 \u2264 l \u2264 r \u2264 500 000, 1 \u2264 m \u2264 10^{10}). The numbers are such that the answer to the problem exists.\n\nOutput\n\nFor each test case output three integers a, b and c such that, l \u2264 a, b, c \u2264 r and there exists a strictly positive integer n such that n \u22c5 a + b - c = m. It is guaranteed that there is at least one possible solution, and you can output any possible combination if there are multiple solutions.\n\nExample\n\nInput\n\n\n2\n4 6 13\n2 3 1\n\n\nOutput\n\n\n4 6 5\n2 2 3\n\nNote\n\nIn the first example n = 3 is possible, then n \u22c5 4 + 6 - 5 = 13 = m. Other possible solutions include: a = 4, b = 5, c = 4 (when n = 3); a = 5, b = 4, c = 6 (when n = 3); a = 6, b = 6, c = 5 (when n = 2); a = 6, b = 5, c = 4 (when n = 2).\n\nIn the second example the only possible case is n = 1: in this case n \u22c5 2 + 2 - 3 = 1 = m. Note that, n = 0 is not possible, since in that case n is not a strictly positive integer.", "You are given three integers x, y and n. Your task is to find the maximum integer k such that 0 \u2264 k \u2264 n that k mod x = y, where mod is modulo operation. Many programming languages use percent operator % to implement it.\n\nIn other words, with given x, y and n you need to find the maximum possible integer from 0 to n that has the remainder y modulo x.\n\nYou have to answer t independent test cases. It is guaranteed that such k exists for each test case.\n\nInput\n\nThe first line of the input contains one integer t (1 \u2264 t \u2264 5 \u22c5 10^4) \u2014 the number of test cases. 
The next t lines contain test cases.\n\nThe only line of the test case contains three integers x, y and n (2 \u2264 x \u2264 10^9;~ 0 \u2264 y < x;~ y \u2264 n \u2264 10^9).\n\nIt can be shown that such k always exists under the given constraints.\n\nOutput\n\nFor each test case, print the answer \u2014 maximum non-negative integer k such that 0 \u2264 k \u2264 n and k mod x = y. It is guaranteed that the answer always exists.\n\nExample\n\nInput\n\n\n7\n7 5 12345\n5 0 4\n10 5 15\n17 8 54321\n499999993 9 1000000000\n10 5 187\n2 0 999999999\n\n\nOutput\n\n\n12339\n0\n15\n54306\n999999995\n185\n999999998\n\nNote\n\nIn the first test case of the example, the answer is 12339 = 7 \u22c5 1762 + 5 (thus, 12339 mod 7 = 5). It is obvious that there is no greater integer not exceeding 12345 which has the remainder 5 modulo 7."], "public_tests": [{"input": ["2\n4 6 13\n2 3 1\n"], "output": ["4 5 4\n2 2 3\n"]}, {"input": ["7\n7 5 12345\n5 0 4\n10 5 15\n17 8 54321\n499999993 9 1000000000\n10 5 187\n2 0 999999999\n"], "output": ["12339\n0\n15\n54306\n999999995\n185\n999999998\n"]}], "private_tests": [{"input": ["20\n10 12 43\n25 49 1\n5 7 39\n8 9 44\n16 17 50\n30 40 975\n601 801 1000\n100 102 909\n599 799 1000\n503 997 9\n194 383 5\n90000 100000 709999\n75000 100000 124999\n375000 499999 625001\n375000 500000 624999\n300000 400000 499999\n250000 500000 1\n70000 80000 2272770257\n70000 80000 9999953344\n90000 100000 9999955820\n", "20\n375000 500000 624999\n375000 499999 624997\n375003 499999 624995\n375002 499999 624995\n375001 499999 624996\n375002 499999 624996\n375001 499999 624997\n375000 499999 624991\n375000 499999 624995\n375000 499999 624994\n375000 499999 624993\n375000 499999 624998\n375000 499999 624996\n375000 499999 624992\n375000 499999 624988\n375000 499999 624986\n375000 499999 624982\n375000 499999 624990\n375000 499999 624991\n375000 499999 624989\n", "20\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n250000 500000 1\n", "4\n4 6 12\n1 1 1\n2 2 2\n3 3 3\n", "20\n1 500000 10000000000\n500000 500000 10000000000\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n", "7\n375000 499999 624997\n375000 500000 624999\n375000 499999 624995\n375000 499999 624994\n375000 499999 624993\n375000 499999 624998\n375000 499999 624996\n", "20\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n375000 500000 624999\n", "1\n1 4 10000000000\n"], "output": ["11 10 11\n25 25 49\n5 5 6\n9 8 9\n17 16 17\n35 30 35\n800 801 601\n101 100 100\n599 599 797\n503 503 997\n194 194 383\n100000 99999 90000\n99999 100000 75000\n375000 375000 
499999\n499999 500000 375000\n399999 400000 300000\n250000 250000 499999\n70007 70000 76998\n70009 77802 70000\n90003 90000 97501\n", "499999 500000 375000\n499998 499999 375000\n499999 499999 375003\n499998 499999 375002\n499998 499999 375001\n499999 499999 375002\n499999 499999 375001\n499992 499999 375000\n499996 499999 375000\n499995 499999 375000\n499994 499999 375000\n499999 499999 375000\n499997 499999 375000\n499993 499999 375000\n499989 499999 375000\n499987 499999 375000\n499983 499999 375000\n499991 499999 375000\n499992 499999 375000\n499990 499999 375000\n", "250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n250000 250000 499999\n", "4 4 4\n1 1 1\n2 2 2\n3 3 3\n", "1 1 1\n500000 500000 500000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n", "499998 499999 375000\n499999 500000 375000\n499996 499999 375000\n499995 499999 375000\n499994 499999 375000\n499999 499999 375000\n499997 499999 375000\n", "499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n499999 500000 375000\n", "1 1 1\n"]}, {"input": ["1\n31 2 104\n", "1\n43284 1 33424242\n", "1\n943643 1 23522222\n", "1\n33 6 100\n", "1\n1000000000 0 999999999\n", "1\n4452384 1 3573842\n"], "output": ["95\n", "33415249\n", "22647433\n", "72\n", "0\n", "1\n"]}], "transformed_sc": ["\nfor _ in range(int(input())):\n l, r, m = map(int, input().split())\n for a in range(l, r + 1):\n minn = m // a\n maxn = 0 - -m // a\n if l - r <= m - minn * a <= r - l and minn > 0:\n m -= minn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n c = b - m\n print(a,b,c)\n break\n break\n if l - r <= m - maxn * a <= r - l:\n m -= maxn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n c = b - m\n print(a,b,c)\n break\n break\n ", "test_cases = int(input())\nfor _ in range(test_cases):\n x, y, n = map(int, input().split())\n if n < x and y == 0:\n print(0)\n elif n < x and y == 1:\n print(1)\n elif n % x == y:\n print(n)\n else:\n if x == 2 and y == 1 and n == 1:\n print(1)\n elif n % x < y:\n print(n - ((n % x) + (x - y)))\n else:\n print(n - ((n % x) - y))"], "sc": ["for _ in range(int(input())):\n l,r,m=map(int,input().split())\n for a in range(l,r+1):\n minn=m//a\n maxn=0--m//a\n if l-r<=m-minn*a<=r-l and minn>0:\n m-=minn*a\n for b in range(l,r+1):\n if -r<=m-b<=-l:\n print(a,b,b-m)\n break\n break\n if l-r<=m-maxn*a<=r-l:\n m-=maxn*a\n for b in range(l,r+1):\n if -r<=m-b<=-l:\n print(a,b,b-m)\n break\n break", "n = int(input())\nfor i in range(n):\n x,y,n = 
map(int,input().split())\n if n < x and y == 0:\n print(0)\n elif n < x and y == 1:\n print(1)\n elif n%x == y:\n print(n)\n else:\n if x == 2 and y == 1 and n == 1:\n print(1)\n elif n%x < y:\n print(n-((n%x)+(x-y)))\n else:\n print(n - ((n%x) - y))"], "sc_cc": [10.0, 11.0], "transformed_mc": ["\ndef find_values(l, r, m):\n for a in range(l, r + 1):\n minn = m // a\n maxn = 0 - -m // a\n if l - r <= m - minn * a <= r - l and minn > 0:\n m -= minn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n return a, b, b - m\n if l - r <= m - maxn * a <= r - l:\n m -= maxn * a\n for b in range(l, r + 1):\n if -r <= m - b <= -l:\n return a, b, b - m\n\ndef main():\n for _ in range(int(input())):\n l, r, m = map(int, input().split())\n a, b, c = find_values(l, r, m)\n print(a, b, c)\n\nif __name__ == '__main__':\n main()\n", "\ndef find_maximum_k(x, y, n):\n if n < x and y == 0:\n return 0\n elif n < x and y == 1:\n return 1\n elif n % x == y:\n return n\n else:\n if x == 2 and y == 1 and n == 1:\n return 1\n elif n % x < y:\n return n - ((n % x) + (x - y))\n else:\n return n - ((n % x) - y)\n\ndef main():\n test_cases = int(input())\n for _ in range(test_cases):\n x, y, n = map(int, input().split())\n print(find_maximum_k(x, y, n))\n\nif __name__ == '__main__':\n main()\n"]} 2 | -------------------------------------------------------------------------------- /codecontests/icl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | import numpy as np 6 | from collections import defaultdict 7 | import torch 8 | from datasets import load_dataset 9 | from vllm import LLM, SamplingParams 10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score 11 | from utils.utils import get_code_modularity_score 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | # When running on the CuDNN backend, two further options must be set 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False 22 | # Set a fixed value for the hash seed 23 | os.environ["PYTHONHASHSEED"] = str(seed) 24 | 25 | 26 | def extract_solution(args, generation): 27 | if "CodeLlama" in args.model: 28 | start_index = generation.find("```") 29 | if start_index == -1: 30 | solution = "" 31 | else: 32 | end_index = generation.find("```", start_index + len("```")) 33 | if start_index < end_index: 34 | solution = generation[start_index + len("```") : end_index] 35 | else: 36 | solution = "" 37 | 38 | elif "deepseek" in args.model: 39 | start_index = generation.find("```") 40 | if start_index == -1: 41 | solution = "" 42 | else: 43 | end_index = generation.find("```", start_index + len("```")) 44 | if start_index < end_index: 45 | solution = generation[start_index + len("```") : end_index] 46 | else: 47 | solution = "" 48 | 49 | return solution 50 | 51 | 52 | def make_prompt(args, demonstration, test_data): 53 | if 'monolithic' in args.code_type: 54 | instruction = ( 55 | "Write a python code to solve the following coding problem " 56 | "that obeys the constraints and passes the example test cases. " 57 | "The output code needs to read from and write to standard IO. 
" 58 | "Please wrap your code answer using ```:" 59 | ) 60 | elif 'modular' in args.code_type: 61 | instruction = ( 62 | "Write a python code to solve the following coding problem " 63 | "that obeys the constraints and passes the example test cases. " 64 | "The output code needs to read from and write to standard IO. " 65 | "Ensure modularity of the python code by dividing the code into smaller, " 66 | "useful functions to solve the given problem. " 67 | "Please wrap your code answer using ```:" 68 | ) 69 | 70 | if "CodeLlama" in args.model: 71 | # make zero-shot or few-shot prompt 72 | prompt = "" 73 | for i in range(args.num_icl_shot): 74 | prompt += "Q: " + instruction + "\n" 75 | prompt += demonstration["description"][i] + "\n" 76 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 77 | prompt += "Q: " + instruction + "\n" 78 | prompt += test_data["description"] + "\n" 79 | prompt += "A: " 80 | elif "deepseek" in args.model: 81 | # make zero-shot or few-shot prompt 82 | prompt = "" 83 | prompt += instruction + "\n" 84 | for i in range(args.num_icl_shot): 85 | prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 86 | prompt += ( 87 | "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 88 | ) 89 | prompt += "### Instruction:\n" + test_data["description"] + "\n" 90 | prompt += "### Response:\n" 91 | 92 | return prompt 93 | 94 | 95 | def extract_demonstration(train_dataset, shot, code_type): 96 | if 'transformed' not in code_type: 97 | problem_index_with_both_sc_and_mc = [] 98 | for i, data in enumerate(train_dataset): 99 | num_sc = len(data['monolithic_codes']['monolithic_code']) 100 | num_mc = len(data['modular_codes']['modular_code']) 101 | if num_sc > 0 and num_mc > 0: 102 | problem_index_with_both_sc_and_mc.append(i) 103 | 104 | demonstration = defaultdict(list) 105 | for i in random.sample(problem_index_with_both_sc_and_mc, shot): 106 | data = train_dataset[i] 107 | # modularity check 108 | # print(f'problem {i}') 109 | # tmp = [] 110 | # for code in data['modular_codes']['modular_code']: 111 | # modularity = get_code_modularity_score(code) 112 | # tmp.append(modularity) 113 | # print(tmp) 114 | if code_type == 'monolithic': 115 | demonstration['description'].append(data['problem_description'].strip()) 116 | demonstration['code'].append(data['monolithic_codes']['monolithic_code'][0].strip()) # pick the first code 117 | # print(get_code_modularity_score(data['monolithic_codes']['monolithic_code'][0])) 118 | elif code_type == 'modular': 119 | demonstration['description'].append(data['problem_description'].strip()) 120 | demonstration['code'].append(data['modular_codes']['modular_code'][0].strip()) 121 | print(get_code_modularity_score(data['modular_codes']['modular_code'][0])) 122 | print(data['modular_codes']['modular_code'][0]) 123 | 124 | return demonstration 125 | 126 | else: 127 | if code_type == 'transformed_modular': 128 | key = 'transformed_mc' 129 | elif code_type == 'transformed_monolithic': 130 | key = 'transformed_sc' 131 | 132 | demonstration = defaultdict(list) 133 | for i in range(shot): 134 | demonstration['description'].append(dataset['problem_description'][i].strip()) 135 | demonstration['code'].append(dataset[key][i].strip()) 136 | 137 | return demonstration 138 | 139 | 140 | def main(): 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--seed", type=int, required=True, default=42) 143 | parser.add_argument( 144 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 145 | ) 146 | 
parser.add_argument("--num_gpu", type=int, required=True, default=1, help="total number of gpus used") 147 | parser.add_argument("--dtype", type=str, required=True, default="float16") 148 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 149 | parser.add_argument( 150 | "--num_gen", 151 | type=int, 152 | required=True, 153 | default=1, 154 | help="number of solutions generated per problem", 155 | ) 156 | parser.add_argument( 157 | "--temperature", 158 | type=float, 159 | required=True, 160 | default=0, 161 | help="0 means greedy decoding for vllm", 162 | ) 163 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 164 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 165 | parser.add_argument( 166 | "--swap_space", 167 | type=int, 168 | required=False, 169 | default=4, 170 | help="The size (GiB) of CPU memory per GPU to use as swap space", 171 | ) 172 | parser.add_argument('--code_type', type=str, required=True, default='monolithic') 173 | # additional arguments candidiates: 174 | # max_model_len 175 | # stop 176 | # start_token, end_token 177 | args = parser.parse_args() 178 | 179 | # load model 180 | # when initializing VLLM engine, random.seed() is called internally. 181 | # so, set_seed() should be called after initializing VLLM engine. 182 | model = LLM( 183 | model=args.model, 184 | tensor_parallel_size=args.num_gpu, 185 | dtype=args.dtype, 186 | max_model_len=8192, 187 | swap_space=args.swap_space, 188 | ) 189 | 190 | if "CodeLlama" in args.model: 191 | stop = ["Q:", "A:"] 192 | elif "deepseek" in args.model: 193 | stop = ["### Instruction", "### Response"] 194 | 195 | sampling_params = SamplingParams( 196 | n=args.num_gen, 197 | temperature=args.temperature, 198 | top_p=args.top_p, 199 | max_tokens=args.max_new_token, 200 | stop=stop, 201 | ) 202 | 203 | # load code contest test dataset 204 | test_dataset = load_dataset( 205 | "deepmind/code_contests", 206 | split="test", 207 | ) 208 | 209 | # set seed 210 | set_seed(args.seed) 211 | 212 | base_directory = os.path.dirname(__file__) 213 | 214 | # monolithic(sc) or modular(mc) demonstration 215 | if 'transformed' not in args.code_type: 216 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', 'my_code_contests_divided_train.jsonl')) 217 | # transformed monolithic(tsc) or transformed modular(tmc) demonstration 218 | else: 219 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', f'monolithic_2shot_demonstration_{args.seed}seed.jsonl'))[0] 220 | 221 | demonstration = extract_demonstration(dataset, args.num_icl_shot, args.code_type) 222 | 223 | file_name = f"{args.model.replace('/', '-')}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 224 | 225 | if os.path.exists(os.path.join(base_directory, "result", file_name)): 226 | print(f'{file_name} already exists.') 227 | return 228 | 229 | # make prompt for each test data 230 | prompts = [] 231 | # test_dataset = list(test_dataset)[:5] # for test 232 | for test_data in test_dataset: 233 | prompt = make_prompt(args, demonstration, test_data) 234 | prompts.append(prompt) 235 | 236 | # inference using vllm 237 | generations = [] 238 | solutions = [] 239 | 240 | # generate solution code using vllm 241 | outputs = model.generate( 242 | prompts, sampling_params=sampling_params, use_tqdm=True 243 | ) 244 | for output in outputs: 245 | # for each input in the prompts, args.gen_num number of outputs are generated 246 | 
generations_ = [outs.text for outs in output.outputs] 247 | assert len(generations_) == args.num_gen 248 | # extract solution code from generated code 249 | solutions_ = [ 250 | extract_solution(args, generation) for generation in generations_ 251 | ] 252 | # save generated solutions (list) 253 | generations.append(generations_) 254 | solutions.append(solutions_) 255 | 256 | # save generated solutions 257 | result = [] 258 | for i, test_data in enumerate(test_dataset): 259 | result.append( 260 | { 261 | "name": test_data["name"], 262 | "description": test_data["description"], 263 | "public_tests": test_data["public_tests"], 264 | "private_tests": test_data["private_tests"], 265 | "difficulty": test_data["difficulty"], 266 | "cf_rating": test_data["cf_rating"], # difficulty level 267 | "generated_solutions": generations[i], # list of generated solutions 268 | "extracted_solutions": solutions[i], 269 | "prompt": prompts[i], 270 | "demonstration": demonstration, # contains code and its description 271 | } 272 | ) 273 | 274 | write_dict_to_jsonl(result, os.path.join(base_directory, "result", file_name)) 275 | print(f'program ends.') 276 | 277 | 278 | if __name__ == "__main__": 279 | main() 280 | -------------------------------------------------------------------------------- /apps/data/2shot_demonstration_27seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":1596,"problem_description":"During quarantine chef\u2019s friend invented a game. In this game there are two players, player 1 and Player 2. In center of garden there is one finish circle and both players are at different distances respectively $X$ and $Y$ from finish circle.\nBetween finish circle and Player 1 there are $X$ number of circles and between finish circle and Player 2 there are $Y$ number of circles. Both player wants to reach finish circle with minimum number of jumps. Player can jump one circle to another circle.\nBoth players can skip $2^0-1$ or $2^1- 1$ or \u2026. or $2^N-1$ circles per jump. A player cannot skip same number of circles in a match more than once. If both players uses optimal way to reach finish circle what will be the difference of minimum jumps needed to reach finish circle by both players. \nIf both players reach finish circle with same number of jumps answer will be $0$ $0$.\n\n-----Input:-----\n- The first line of the input contains a single integer $T$ denoting the number of test cases. The \ndescription of $T$ test cases follows.\n- The first line of each test case contains 2 space separated integers $X$ and $Y$.\n\n-----Output:-----\nFor each test case, print a single line containing 2 space-separated integers which player win and what is the difference between number of minimum jump required by both players to reach finish circle.\n\n-----Constraints-----\n- $1 \\leq T \\leq 10^5$\n- $1 \\leq X,Y \\leq 2*10^7$\n\n-----Sample Input:-----\n2\n4 5\n3 5\n\n-----Sample Output:-----\n0 0\n1 1\n\n-----Explanation:-----\nTest Case 1:\n\nTest Case 2:","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. 
Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. Please wrap your code answer using ```:","sc":"import math\n\nfor i in range(int(input())):\n p,q=list(map(int,input().split()))\n c=0\n h=0\n \n while(q>=0):\n if(q==0):\n h+=1\n break\n \n d=int(math.log2(q+1))\n if(d==0):\n h+=1\n break\n y=(2**d)-1\n q-=y+1\n if(q==-1):\n h+=1\n break\n h+=1\n \n while(p>=0):\n if(p==0):\n c+=1\n break\n else:\n rem=int(math.log2(p+1))\n \n if(rem==0):\n c+=1\n break\n \n y=(2**rem)-1\n p-=y+1\n if(p==-1):\n c+=1\n break\n c+=1\n\n if(c==h):\n print(0,0)\n if(ch):\n print(2,c-h)","sc_cc":13.0,"mc":"takeArr = lambda: list(map(int,input().split()))\ntakeList = lambda: list(map(int,input().split()))\nimport sys\nsys.setrecursionlimit(10**6)\n\n\n\nfrom math import floor,ceil,log2 \ndef powOfPositive(n) : \n pos = floor(log2(n)); \n return 2**pos; \ndef powOfNegative(n) : \n pos = ceil(log2(n)); \n return (-1 * pow(2, pos)); \ndef highestPowerOf2(n) : \n if (n > 0) : \n return powOfPositive(n); \n else : \n n = -n; \n return powOfNegative(n); \ndef main(t):\n x,y = takeArr()\n a,b = x+1,y+1\n sa = sb = 0\n while a:\n a -= highestPowerOf2(a)\n sa += 1\n while b:\n b -= highestPowerOf2(b)\n sb += 1\n \n winner = 2 if sa>sb else 1 if sb>sa else 0\n score = abs(sa-sb) if winner else 0\n print(winner,score)\n if t>1:\n main(t-1)\nmain(int(input()))","mc_cc":2.4,"transformed_mc":["\nimport math\n\ndef calculate_jumps(distance):\n jumps = 0\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n\n return jumps\n\ndef determine_winner_and_difference(player1_distance, player2_distance):\n player1_jumps = calculate_jumps(player1_distance)\n player2_jumps = calculate_jumps(player2_distance)\n\n if player1_jumps == player2_jumps:\n return 0, 0\n elif player1_jumps < player2_jumps:\n return 1, player2_jumps - player1_jumps\n else:\n return 2, player1_jumps - player2_jumps\n\ndef main():\n for _ in range(int(input())):\n player1_distance, player2_distance = map(int, input().split())\n winner, difference = determine_winner_and_difference(player1_distance, player2_distance)\n print(winner, difference)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import math\n\nfor _ in range(int(input())):\n player1_distance, player2_distance = map(int, input().split())\n \n jumps = 0\n distance = player1_distance\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n player1_jumps = jumps\n \n jumps = 0\n distance = player2_distance\n while distance >= 0:\n if distance == 0:\n jumps += 1\n break\n power = int(math.log2(distance + 1))\n if power == 0:\n jumps += 1\n break\n step = (2 ** power) - 1\n distance -= step + 1\n if distance == -1:\n jumps += 1\n break\n jumps += 1\n player2_jumps = jumps\n \n if player1_jumps == player2_jumps:\n winner = 0\n difference = 0\n elif player1_jumps < player2_jumps:\n winner = 1\n difference = player2_jumps - player1_jumps\n else:\n winner = 2\n difference = player1_jumps - player2_jumps\n \n print(winner, difference)"]} 2 | {"problem_id":2348,"problem_description":"N hotels are located on a straight line. 
The coordinate of the i-th hotel (1 \\leq i \\leq N) is x_i.\nTak the traveler has the following two personal principles:\n - He never travels a distance of more than L in a single day.\n - He never sleeps in the open. That is, he must stay at a hotel at the end of a day.\nYou are given Q queries. The j-th (1 \\leq j \\leq Q) query is described by two distinct integers a_j and b_j.\nFor each query, find the minimum number of days that Tak needs to travel from the a_j-th hotel to the b_j-th hotel following his principles.\nIt is guaranteed that he can always travel from the a_j-th hotel to the b_j-th hotel, in any given input.\n\n-----Constraints-----\n - 2 \\leq N \\leq 10^5\n - 1 \\leq L \\leq 10^9\n - 1 \\leq Q \\leq 10^5\n - 1 \\leq x_i < x_2 < ... < x_N \\leq 10^9\n - x_{i+1} - x_i \\leq L\n - 1 \\leq a_j,b_j \\leq N\n - a_j \\neq b_j\n - N,\\,L,\\,Q,\\,x_i,\\,a_j,\\,b_j are integers.\n\n-----Partial Score-----\n - 200 points will be awarded for passing the test set satisfying N \\leq 10^3 and Q \\leq 10^3.\n\n-----Input-----\nThe input is given from Standard Input in the following format:\nN\nx_1 x_2 ... x_N\nL\nQ\na_1 b_1\na_2 b_2\n:\na_Q b_Q\n\n-----Output-----\nPrint Q lines.\nThe j-th line (1 \\leq j \\leq Q) should contain the minimum number of days that Tak needs to travel from the a_j-th hotel to the b_j-th hotel.\n\n-----Sample Input-----\n9\n1 3 6 13 15 18 19 29 31\n10\n4\n1 8\n7 3\n6 7\n8 5\n\n-----Sample Output-----\n4\n2\n1\n2\n\nFor the 1-st query, he can travel from the 1-st hotel to the 8-th hotel in 4 days, as follows:\n - Day 1: Travel from the 1-st hotel to the 2-nd hotel. The distance traveled is 2.\n - Day 2: Travel from the 2-nd hotel to the 4-th hotel. The distance traveled is 10.\n - Day 3: Travel from the 4-th hotel to the 7-th hotel. The distance traveled is 6.\n - Day 4: Travel from the 7-th hotel to the 8-th hotel. The distance traveled is 10.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"import bisect\nimport sys\ninput = sys.stdin.readline\nn = int(input())\na = list(map(int,input().split()))\nd = int(input())\ngraph = [[0 for i in range(n+1)] for j in range(18)]\nfor i in range(n):\n x = bisect.bisect_right(a,a[i]+d)\n graph[0][i+1] = x\nfor j in range(1,18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\nq = int(input())\nfor _ in range(q):\n x,y = map(int,input().split())\n x,y = min(x,y),max(x,y)\n ans = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n ans += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n ans += 1\n print(ans)","sc_cc":11.0,"mc":"def reachN(i0, n):\n if n == 0:\n return i0\n maxbit = ceil(log2(n))\n kL = []\n for i in range(maxbit+1):\n if n>>i & 1:\n kL.append(i)\n \n i = i0\n for k in kL[::-1]:\n i = dp[k][i] \n return i \n\ndef fun(n):\n return reachN(start, n) >= goal\n \ndef binMin(l, r):\n if r-l == 1:\n return r\n m = (l+r) \/\/ 2\n if fun(m):\n r = m\n else:\n l = m\n return binMin(l, r)\n\nimport bisect\nfrom math import log2, ceil\n\nN = int(input())\nxL = [-float('inf')] + [int(i) for i in input().split()] \nLmax = int(input())\n\nkmax = ceil(log2(len(xL)-2)) \ndp = [[len(xL)-1]*(N+1) for _ in range(kmax+1)]\nfor i in range(1, N+1):\n dp[0][i] = bisect.bisect_right(xL, xL[i]+Lmax) - 1\n\nfor k in range(1, kmax+1):\n for i in range(1, N):\n dp[k][i] = dp[k-1][dp[k-1][i]]\n\n\nQ = int(input())\nfor _ in range(Q):\n start, goal = list(map(int, input().split()))\n if start > goal:\n start, goal = goal, start\n ans = binMin(0, goal-start)\n print(ans)","mc_cc":4.25,"transformed_mc":["\nimport bisect\nimport sys\n\ndef hotel_distances(n, a, d):\n graph = [[0 for i in range(n+1)] for j in range(18)]\n for i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\n for j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n return graph\n\ndef calculate_days_to_travel(graph, x, y):\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days += 1\n return days\n\ndef main():\n n = int(input())\n a = list(map(int, input().split()))\n d = int(input())\n graph = hotel_distances(n, a, d)\n q = int(input())\n for _ in range(q):\n x, y = map(int, input().split())\n days = calculate_days_to_travel(graph, x, y)\n print(days)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["import bisect\nimport sys\n\ndef hotel_distances(n, a, d):\n graph = [[0 for i in range(n+1)] for j in range(18)]\n for i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\n for j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n return graph\n\ndef calculate_days_to_travel(graph, x, y):\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days += 1\n return days\n\nn = int(input())\na = list(map(int, input().split()))\nd = int(input())\ngraph = [[0 for i in range(n+1)] for j in range(18)]\nfor i in range(n):\n x = bisect.bisect_right(a, a[i] + d)\n graph[0][i+1] = x\nfor j in range(1, 18):\n for i in range(n):\n t = graph[j-1][i+1]\n graph[j][i+1] = graph[j-1][t]\n\nq = int(input())\nfor _ in range(q):\n x, y = map(int, input().split())\n x, y = min(x, y), max(x, y)\n days = 0\n for j in range(18)[::-1]:\n if graph[j][x] < y:\n days += 2**j\n x = graph[j][x]\n if j == 0 and x < y:\n days 
+= 1\n print(days)"]} 3 | -------------------------------------------------------------------------------- /codecontests/icl_ft.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import argparse 4 | from tqdm import tqdm 5 | import numpy as np 6 | from collections import defaultdict 7 | import torch 8 | from datasets import load_dataset 9 | from vllm import LLM, SamplingParams 10 | from utils.utils import read_jsonl_to_dict, write_dict_to_jsonl, get_code_style_score 11 | from utils.utils import get_code_modularity_score 12 | 13 | 14 | def set_seed(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | # When running on the CuDNN backend, two further options must be set 20 | torch.backends.cudnn.deterministic = True 21 | torch.backends.cudnn.benchmark = False 22 | # Set a fixed value for the hash seed 23 | os.environ["PYTHONHASHSEED"] = str(seed) 24 | 25 | 26 | def extract_solution(args, generation): 27 | if "CodeLlama" in args.model: 28 | # start_index = generation.find("```") 29 | # if start_index == -1: 30 | # solution = "" 31 | # else: 32 | # end_index = generation.find("```", start_index + len("```")) 33 | # if start_index < end_index: 34 | # solution = generation[start_index + len("```") : end_index] 35 | # else: 36 | # solution = "" 37 | idx = generation.find('```') 38 | if idx != -1: 39 | solution = generation[:idx] 40 | else: 41 | solution = generation.strip() 42 | 43 | elif "deepseek" in args.model: 44 | idx = generation.find('```') 45 | if idx != -1: 46 | solution = generation[:idx] 47 | else: 48 | solution = generation.strip() 49 | 50 | return solution 51 | 52 | 53 | def make_prompt(args, demonstration, test_data): 54 | instruction = ( 55 | "Write a python code to solve the following coding problem " 56 | "that obeys the constraints and passes the example test cases. " 57 | "The output code needs to read from and write to standard IO. 
" 58 | "Please wrap your code answer using ```:" 59 | ) 60 | 61 | if "CodeLlama" in args.model: 62 | # make zero-shot or few-shot prompt 63 | prompt = "" 64 | for i in range(args.num_icl_shot): 65 | prompt += "Q: " + instruction + "\n" 66 | prompt += demonstration["description"][i] + "\n" 67 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 68 | prompt += "Q: " + instruction + "\n" 69 | prompt += test_data["description"].strip() + "\n" 70 | prompt += "A: ```" 71 | elif "deepseek" in args.model: 72 | prompt = "" 73 | for i in range(args.num_icl_shot): 74 | prompt += "Q: " + instruction + "\n" 75 | prompt += demonstration["description"][i] + "\n" 76 | prompt += "A: " + "```" + demonstration["code"][i] + "```" + "\n" 77 | prompt += "Q: " + instruction + "\n" 78 | prompt += test_data["description"].strip() + "\n" 79 | prompt += "A: ```" 80 | 81 | # # make zero-shot or few-shot prompt 82 | # prompt = "" 83 | # prompt += instruction + "\n" 84 | # for i in range(args.num_icl_shot): 85 | # prompt += "### Instruction:\n" + demonstration["description"][i] + "\n" 86 | # prompt += ( 87 | # "### Response:\n" + "```" + demonstration["code"][i] + "```" + "\n" 88 | # ) 89 | # prompt += "### Instruction:\n" + test_data["description"].strip() + "\n" 90 | # prompt += "### Response:\n" 91 | 92 | return prompt 93 | 94 | 95 | def extract_demonstration(train_dataset, shot, code_type): 96 | if 'transformed' not in code_type: 97 | problem_index_with_both_sc_and_mc = [] 98 | for i, data in enumerate(train_dataset): 99 | num_sc = len(data['monolithic_codes']['monolithic_code']) 100 | num_mc = len(data['modular_codes']['modular_code']) 101 | if num_sc > 0 and num_mc > 0: 102 | problem_index_with_both_sc_and_mc.append(i) 103 | 104 | demonstration = defaultdict(list) 105 | for i in random.sample(problem_index_with_both_sc_and_mc, shot): 106 | data = train_dataset[i] 107 | # modularity check 108 | # print(f'problem {i}') 109 | # tmp = [] 110 | # for code in data['modular_codes']['modular_code']: 111 | # modularity = get_code_modularity_score(code) 112 | # tmp.append(modularity) 113 | # print(tmp) 114 | if code_type == 'monolithic': 115 | demonstration['description'].append(data['problem_description'].strip()) 116 | demonstration['code'].append(data['monolithic_codes']['monolithic_code'][0].strip()) # pick the first code 117 | # print(get_code_modularity_score(data['monolithic_codes']['monolithic_code'][0])) 118 | elif code_type == 'modular': 119 | demonstration['description'].append(data['problem_description'].strip()) 120 | demonstration['code'].append(data['modular_codes']['modular_code'][0].strip()) 121 | print(get_code_modularity_score(data['modular_codes']['modular_code'][0])) 122 | print(data['modular_codes']['modular_code'][0]) 123 | 124 | return demonstration 125 | 126 | else: 127 | if code_type == 'transformed_modular': 128 | key = 'transformed_mc' 129 | elif code_type == 'transformed_monolithic': 130 | key = 'transformed_sc' 131 | 132 | demonstration = defaultdict(list) 133 | for i in range(shot): 134 | demonstration['description'].append(dataset['problem_description'][i].strip()) 135 | demonstration['code'].append(dataset[key][i].strip()) 136 | 137 | return demonstration 138 | 139 | 140 | def main(): 141 | parser = argparse.ArgumentParser() 142 | parser.add_argument("--seed", type=int, required=True, default=42) 143 | parser.add_argument( 144 | "--model", type=str, required=True, default="meta-llama/CodeLlama-7b-hf" 145 | ) 146 | parser.add_argument("--num_gpu", type=int, required=True, 
default=1, help="total number of gpus used") 147 | parser.add_argument("--dtype", type=str, required=True, default="float16") 148 | parser.add_argument("--num_icl_shot", type=int, required=True, default=2) 149 | parser.add_argument( 150 | "--num_gen", 151 | type=int, 152 | required=True, 153 | default=1, 154 | help="number of solutions generated per problem", 155 | ) 156 | parser.add_argument( 157 | "--temperature", 158 | type=float, 159 | required=True, 160 | default=0, 161 | help="0 means greedy decoding for vllm", 162 | ) 163 | parser.add_argument("--max_new_token", type=int, required=True, default=1024) 164 | parser.add_argument("--top_p", type=float, required=True, default=0.95) 165 | parser.add_argument( 166 | "--swap_space", 167 | type=int, 168 | required=False, 169 | default=4, 170 | help="The size (GiB) of CPU memory per GPU to use as swap space", 171 | ) 172 | parser.add_argument('--code_type', type=str, required=True, default='monolithic') 173 | parser.add_argument('--degree', type=str, required=True, default='low') 174 | parser.add_argument('--debug_mode', type=int, required=True, default=0) 175 | parser.add_argument('--chkpt', type=str, required=True, default=0) 176 | # additional arguments candidiates: 177 | # max_model_len 178 | # stop 179 | # start_token, end_token 180 | args = parser.parse_args() 181 | 182 | # load model 183 | # when initializing VLLM engine, random.seed() is called internally. 184 | # so, set_seed() should be called after initializing VLLM engine. 185 | model = LLM( 186 | model=args.model, 187 | tensor_parallel_size=args.num_gpu, 188 | dtype=args.dtype, 189 | max_model_len=8192, 190 | swap_space=args.swap_space, 191 | ) 192 | 193 | # all models are fine-tuned with "Q:,, A:,," format 194 | stop = ["Q:", "A:"] 195 | 196 | sampling_params = SamplingParams( 197 | n=args.num_gen, 198 | temperature=args.temperature, 199 | top_p=args.top_p, 200 | max_tokens=args.max_new_token, 201 | stop=stop, 202 | ) 203 | 204 | # load code contest test dataset 205 | test_dataset = load_dataset( 206 | "deepmind/code_contests", 207 | split="test", 208 | ) 209 | 210 | # set seed 211 | set_seed(args.seed) 212 | 213 | base_directory = os.path.dirname(__file__) 214 | 215 | # monolithic(sc) or modular(mc) demonstration 216 | if 'transformed' not in args.code_type: 217 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', 'my_code_contests_divided_train.jsonl')) 218 | # transformed monolithic(tsc) or transformed modular(tmc) demonstration 219 | else: 220 | dataset = read_jsonl_to_dict(os.path.join(os.path.dirname(__file__), 'data', f'monolithic_2shot_demonstration_{args.seed}seed.jsonl'))[0] 221 | 222 | demonstration = extract_demonstration(dataset, args.num_icl_shot, args.code_type) 223 | 224 | if "CodeLlama" in args.model: 225 | file_name = f"CodeLlama_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 226 | elif "deepseek" in args.model: 227 | file_name = f"DeepSeek_{args.degree}_mod_chkpt{args.chkpt}_{args.num_icl_shot}shot_{args.num_gen}gen_{args.temperature}temp_{args.seed}seed_icl_result.jsonl" 228 | 229 | 230 | if os.path.exists(os.path.join(base_directory, "result/ft", file_name)): 231 | print(f'{file_name} already exists.') 232 | return 233 | 234 | # make prompt for each test data 235 | if args.debug_mode: 236 | test_dataset = list(test_dataset)[:10] # for test 237 | 238 | prompts = [] 239 | for test_data in test_dataset: 240 | prompt = make_prompt(args, 
demonstration, test_data) 241 | prompts.append(prompt) 242 | 243 | # inference using vllm 244 | generations = [] 245 | solutions = [] 246 | 247 | # generate solution code using vllm 248 | outputs = model.generate( 249 | prompts, sampling_params=sampling_params, use_tqdm=True 250 | ) 251 | for idx, output in enumerate(outputs): 252 | # for each input in the prompts, args.gen_num number of outputs are generated 253 | generations_ = [outs.text.strip() for outs in output.outputs] 254 | assert len(generations_) == args.num_gen 255 | # extract solution code from generated code 256 | solutions_ = [ 257 | extract_solution(args, generation) for generation in generations_ 258 | ] 259 | if args.debug_mode: 260 | print(f'problem {idx}, prompt:') 261 | print(prompts[idx]) 262 | print('-' * 100) 263 | print('generation:') 264 | print(generations_[0].strip()) 265 | print('-' * 100) 266 | print('solution:') 267 | print(solutions_[0].strip()) 268 | print('*' * 100) 269 | 270 | # save generated solutions (list) 271 | generations.append(generations_) 272 | solutions.append(solutions_) 273 | 274 | # save generated solutions 275 | result = [] 276 | for i, test_data in enumerate(test_dataset): 277 | result.append( 278 | { 279 | "name": test_data["name"], 280 | "description": test_data["description"], 281 | "public_tests": test_data["public_tests"], 282 | "private_tests": test_data["private_tests"], 283 | "difficulty": test_data["difficulty"], 284 | "cf_rating": test_data["cf_rating"], # difficulty level 285 | "generated_solutions": generations[i], # list of generated solutions 286 | "extracted_solutions": solutions[i], 287 | "prompt": prompts[i], 288 | "demonstration": demonstration, # contains code and its description 289 | } 290 | ) 291 | 292 | if not args.debug_mode: 293 | write_dict_to_jsonl(result, os.path.join(base_directory, "result/ft", file_name)) 294 | print(f'program ends.') 295 | 296 | 297 | if __name__ == "__main__": 298 | main() 299 | -------------------------------------------------------------------------------- /apps/data/2shot_demonstration_42seed.json: -------------------------------------------------------------------------------- 1 | {"problem_id":50,"problem_description":"Karlsson has recently discovered a huge stock of berry jam jars in the basement of the house. More specifically, there were $2n$ jars of strawberry and blueberry jam.\n\nAll the $2n$ jars are arranged in a row. The stairs to the basement are exactly in the middle of that row. So when Karlsson enters the basement, he sees exactly $n$ jars to his left and $n$ jars to his right.\n\nFor example, the basement might look like this: [Image] \n\nBeing the starightforward man he is, he immediately starts eating the jam. In one minute he chooses to empty either the first non-empty jar to his left or the first non-empty jar to his right.\n\nFinally, Karlsson decided that at the end the amount of full strawberry and blueberry jam jars should become the same.\n\nFor example, this might be the result: [Image] He has eaten $1$ jar to his left and then $5$ jars to his right. There remained exactly $3$ full jars of both strawberry and blueberry jam. 
\n\nJars are numbered from $1$ to $2n$ from left to right, so Karlsson initially stands between jars $n$ and $n+1$.\n\nWhat is the minimum number of jars Karlsson is required to empty so that an equal number of full strawberry and blueberry jam jars is left?\n\nYour program should answer $t$ independent test cases.\n\n\n-----Input-----\n\nThe first line contains one integer $t$ ($1 \\le t \\le 1000$) \u2014 the number of test cases.\n\nThe first line of each test case contains a single integer $n$ ($1 \\le n \\le 10^5$).\n\nThe second line of each test case contains $2n$ integers $a_1, a_2, \\dots, a_{2n}$ ($1 \\le a_i \\le 2$) \u2014 $a_i=1$ means that the $i$-th jar from the left is a strawberry jam jar and $a_i=2$ means that it is a blueberry jam jar.\n\nIt is guaranteed that the sum of $n$ over all test cases does not exceed $10^5$.\n\n\n-----Output-----\n\nFor each test case print the answer to it \u2014 the minimum number of jars Karlsson is required to empty so that an equal number of full strawberry and blueberry jam jars is left.\n\n\n-----Example-----\nInput\n4\n6\n1 1 1 2 2 1 2 1 2 1 1 2\n2\n1 2 1 2\n3\n1 1 1 1 1 1\n2\n2 1 1 1\n\nOutput\n6\n0\n6\n2\n\n\n\n-----Note-----\n\nThe picture from the statement describes the first test case.\n\nIn the second test case the number of strawberry and blueberry jam jars is already equal.\n\nIn the third test case Karlsson is required to eat all $6$ jars so that there remain $0$ jars of both jams.\n\nIn the fourth test case Karlsson can empty either the second and the third jars or the third and the fourth one. The both scenarios will leave $1$ jar of both jams.","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"for tcase in range(int(input())):\n n=int(input())\n ls = list(map(int, input().split()))\n oneneed = 2*(n - ls.count(1))\n ldct = {0:0}\n ctr = 0\n eaten = 0\n for i in range(n-1,-1,-1):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n\n rdct = {0:0}\n ctr = 0\n eaten = 0\n for i in range(n,2*n):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n \n\n best=99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k]+ldct[otk])\n print(best)","sc_cc":10.0,"mc":"import sys\n\ndef minp():\n\treturn sys.stdin.readline().strip()\n\ndef mint():\n\treturn int(minp())\n\ndef mints():\n\treturn list(map(int,minp().split()))\n\ndef solve():\n\tn = mint()\n\ta = list(mints())\n\tc = dict()\n\tc[0] = 2*n\n\td = 0\n\t\n\tfor i in range(2*n-1,n-1,-1):\n\t\tif a[i] == 1:\n\t\t\td += 1\n\t\telse:\n\t\t\td -= 1\n\t\t\n\t\tc[d] = i\n\t\n\td = 0\n\tr = 2*n\n\tr = min(r, n + c[0] - n)\n\tfor i in range(n):\n\t\tif a[i] == 1:\n\t\t\td += 1\n\t\telse:\n\t\t\td -= 1\n\t\t\n\t\tif (-d) in c:\n\t\t\tr = min(r, n - i - 1 + c[-d] - n)\n\t\n\treturn r\n\n\nfor i in range(mint()):\n\tprint(solve())","mc_cc":2.2,"transformed_mc":["\ndef find_jars_to_empty(t, test_cases):\n result = []\n for i in range(t):\n n = test_cases[i][0]\n ls = test_cases[i][1]\n oneneed = 2 * (n - ls.count(1))\n ldct, rdct = get_counts(n, ls)\n best = find_min_jars(ldct, rdct, oneneed)\n result.append(best)\n return result\n\ndef get_counts(n, ls):\n ldct = {0: 0}\n rdct = {0: 0}\n ctr = 0\n eaten = 0\n for i in range(n-1, -1, -1):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n\n ctr = 0\n eaten = 0\n for i in range(n, 2*n):\n eaten += 1\n ctr += (1 if ls[i] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n return ldct, rdct\n\ndef find_min_jars(ldct, rdct, oneneed):\n best = 99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k] + ldct[otk])\n return best\n\ndef main():\n t = int(input())\n test_cases = []\n for _ in range(t):\n n = int(input())\n jars = list(map(int, input().split()))\n test_cases.append((n, jars))\n\n result = find_jars_to_empty(t, test_cases)\n for res in result:\n print(res)\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["t = int(input())\ntest_cases = []\nfor _ in range(t):\n n = int(input())\n jars = list(map(int, input().split()))\n test_cases.append((n, jars))\nresult = []\nfor i in range(t):\n n = test_cases[i][0]\n ls = test_cases[i][1]\n oneneed = 2 * (n - ls.count(1))\n ldct = {0: 0}\n rdct = {0: 0}\n ctr = 0\n eaten = 0\n for j in range(n-1, -1, -1):\n eaten += 1\n ctr += (1 if ls[j] == 2 else -1)\n if ctr not in ldct:\n ldct[ctr] = eaten\n ctr = 0\n eaten = 0\n for j in range(n, 2*n):\n eaten += 1\n ctr += (1 if ls[j] == 2 else -1)\n if ctr not in rdct:\n rdct[ctr] = eaten\n best = 99**99\n for k in list(rdct.keys()):\n otk = oneneed - k\n if otk in ldct:\n best = min(best, rdct[k] + ldct[otk])\n result.append(best)\nfor res in result:\n print(res)"]} 2 | {"problem_id":692,"problem_description":"There is crazy man named P29892P. He always tries to do crazy things as he thinks. One day he invented a machine and named it as ANGEN. The ANGEN is used to perform range operations. The range operation means performing operations on range values from {L..R}. 
At any time it performs operations on values in between L to R.\nANGEN can perform following operations\nU I V - Update the value present at I with value V\nA L R - Find the sum between range L and R\nM L R - Find the maximum number between L and R\nm L R - Find the minimum number between L and R\nS L R - Find second maximum value in between L and R\ns L R - Find second mimimum value in between L and R\nIf it is not possible perform operation ANGEN returns \u201cNA\u201d with out quotes.\nFor Invalid operations ANGEN returns \u201c!!!\u201d with out quotes.\nNow P29892P challenging his friends and you too, to build his invention with yourown code. So it's your time to defeat P29892P by implementing his invention with your own ability. Let's go and solve the problem.\n\n-----Input-----\nInput description.\n- The first line of the input contains an integer N denoting the number of integers. \n- The next line contains N space separated integers.\"\n- The next line contains a single integer Q denoting the number of Questions.\n- The next Q lines contains T Question type , L and R.\n\n-----Output-----\nPrint output for each question in separate line.\n\n-----Constraints-----\nShould contain all the constraints on the input data that you may have. Format it like:\n- 1 \u2264 N \u2264 100000\n- 1 \u2264 values[i] \u2264 1000000000\n- 1 \u2264 Q \u2264 10000\n- T in { A..Z, a..z }\n- 1 \u2264 L \u2264 R \u2264 N\n\n-----Example-----\nInput:\n6\n1 2 5 3 10 6\n6\nA 1 5\nM 1 3\nm 5 6\ns 3 6\nU 1 7\nS 1 2\n\nOutput:\n21\n5\n6\n5\n2\n\n-----Explanation-----\n...","starter_code":"","sc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Please wrap your code answer using ```:","mc_instruction":"Write a python code to solve the following coding problem that obeys the constraints and passes the example test cases. The output code needs to read from and write to standard IO. Ensure modularity of the python code by dividing the code into smaller, useful functions to solve the given problem. 
Please wrap your code answer using ```:","sc":"n = eval(input())\narr = list(map(int,input().split()))\nq = eval(input())\nwhile q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1])\n r = int(ar[2])\n l -= 1\n if t == 'U':\n arr[l] = r\n elif t == 'A':\n print(sum(arr[l:r]))\n elif t == 'M':\n print(max(arr[l:r]))\n elif t == 'm':\n print(min(arr[l:r]))\n elif t == 'S':\n m = max(arr[l:r])\n m2 = -1\n for i in range(l, r):\n if arr[i] < m and arr[i] > m2:\n m2 = arr[i]\n print(m2)\n elif t == 's':\n m = min(arr[l:r])\n m2 = 1000000000000\n for i in range(l, r):\n if arr[i] > m and arr[i] < m2:\n m2 = arr[i]\n print(m2)","sc_cc":14.0,"mc":"def na(l,r):\n if l-1>=r or l>n or r>n:\n return True\n else:\n return False\n\ndef na_print():\n print('NA')\n\n\n\ndef func(t,l,r):\n if t=='U':\n try:\n a[l-1]=r\n return ''\n except:\n print('NA')\n return ''\n elif na(l,r):\n na_print()\n return ''\n \n if t=='A':\n print(sum(a[l-1:r]))\n \n elif t=='M':\n print(max(a[l-1:r]))\n \n elif t=='m':\n print(min(a[l-1:r]))\n \n elif t=='S':\n bb = list(set(a[l-1:r]))\n bb.sort()\n print(bb[-2])\n \n elif t=='s':\n bb = list(set(a[l-1:r]))\n bb.sort()\n print(bb[1])\n \n else:\n print('!!!')\n\n\n\n\nn = eval(input())\na = list(map(int, input().split()))\nq = eval(input())\n\nfor i in range(q):\n t,l,r = input().split()\n func(t,int(l),int(r))","mc_cc":4.0,"transformed_mc":["\ndef update_value(arr, idx, val):\n arr[idx] = val\n\ndef find_sum(arr, start, end):\n return sum(arr[start:end])\n\ndef find_max(arr, start, end):\n return max(arr[start:end])\n\ndef find_min(arr, start, end):\n return min(arr[start:end])\n\ndef find_second_max(arr, start, end):\n max_val = max(arr[start:end])\n second_max = -1\n for i in range(start, end):\n if arr[i] < max_val and arr[i] > second_max:\n second_max = arr[i]\n return second_max\n\ndef find_second_min(arr, start, end):\n min_val = min(arr[start:end])\n second_min = 1000000000000\n for i in range(start, end):\n if arr[i] > min_val and arr[i] < second_min:\n second_min = arr[i]\n return second_min\n\ndef main():\n n = int(input())\n arr = list(map(int, input().split()))\n q = int(input())\n \n while q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1]) - 1\n r = int(ar[2])\n \n if t == 'U':\n update_value(arr, l, r)\n elif t == 'A':\n print(find_sum(arr, l, r))\n elif t == 'M':\n print(find_max(arr, l, r))\n elif t == 'm':\n print(find_min(arr, l, r))\n elif t == 'S':\n print(find_second_max(arr, l, r))\n elif t == 's':\n print(find_second_min(arr, l, r))\n\nif __name__ == '__main__':\n main()\n"],"transformed_sc":["n = int(input())\narr = list(map(int, input().split()))\nq = int(input())\n\nwhile q:\n q -= 1\n ar = input().split()\n t = ar[0]\n l = int(ar[1]) - 1\n r = int(ar[2])\n \n if t == 'U':\n arr[l] = r\n elif t == 'A':\n print(sum(arr[l:r]))\n elif t == 'M':\n print(max(arr[l:r]))\n elif t == 'm':\n print(min(arr[l:r]))\n elif t == 'S':\n max_val = max(arr[l:r])\n second_max = -1\n for i in range(l, r):\n if arr[i] < max_val and arr[i] > second_max:\n second_max = arr[i]\n print(second_max)\n elif t == 's':\n min_val = min(arr[l:r])\n second_min = 1000000000000\n for i in range(l, r):\n if arr[i] > min_val and arr[i] < second_min:\n second_min = arr[i]\n print(second_min)"]} 3 | --------------------------------------------------------------------------------