",
18 | "lstrip": false,
19 | "normalized": true,
20 | "rstrip": false,
21 | "single_word": false
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/gem5/template_config.yaml:
--------------------------------------------------------------------------------
1 | model_generated_outputs_path: "PATH_TO_YOUR_OUTPUTS"
2 | reference_file_path: "PATH_TO_REFERENCE_FILE_JSONL" # The path to the reference file. This should be the reference .jsonl file containing the reference outputs in addition to all other metadata in the test set file.
3 | output_dir: "PATH_TO_DIRECTORY_WHERE_YOU_WANT_TO_SAVE_EVALUATION_RESULTS"
4 | is_prompt_based: False # should always be False
5 | cpus_available: -1
6 | model_generated_potentially_faster_code_col: "generated_answers" # column in the model-generated outputs that contains the generated code; it should be a list of strings
7 | num_problems_to_evaluate: -1 # -1 means evaluate all problems
8 |
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "additional_special_tokens": [
3 | "▁",
4 | "▁",
5 | "▁",
6 | "▁"
7 | ],
8 | "bos_token": {
9 | "content": "",
10 | "lstrip": false,
11 | "normalized": true,
12 | "rstrip": false,
13 | "single_word": false
14 | },
15 | "eos_token": {
16 | "content": "",
17 | "lstrip": false,
18 | "normalized": true,
19 | "rstrip": false,
20 | "single_word": false
21 | },
22 | "unk_token": {
23 | "content": "",
24 | "lstrip": false,
25 | "normalized": true,
26 | "rstrip": false,
27 | "single_word": false
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/scripts/template_config.yaml:
--------------------------------------------------------------------------------
1 | text_gen_args:
2 | generation_model_name: "your_model_name"
3 | volume_mount: "/path/to/your/volume"
4 | max_best_of: 5
5 | port: 4242
6 |
7 | sampling_args:
8 | test_file: "/path/to/your/test_file"
9 | output_file: "/path/to/your/output_file"
10 | do_sample: true
11 | num_samples: 8
12 | max_new_tokens: 1000
13 | temperature: 0.7
14 | num_threads: 20
15 | prompt_name: "code_opt"
16 |
17 | eval_args:
18 | output_dir: "/path/to/your/evaluation_output_directory"
19 | is_prompt_based: false
20 | cpus_available: -1
21 | model_generated_potentially_faster_code_col: "generated_answers"
22 | num_problems_to_evaluate: -1
23 |
--------------------------------------------------------------------------------
/openai_finetuning/README.md:
--------------------------------------------------------------------------------
1 | The script `finetune_openai.py` was used to finetune GPT-3.5 Turbo. Its usage is as follows:
2 |
3 | ```bash
4 | python finetune_openai.py PATH_TO_CONFIG.yaml
5 | ```
6 |
7 | We've included a sample config file `config.yaml` in this directory. The config file should contain the following fields:
8 |
9 | ```yaml
10 | api_key: "YOUR_OPENAI_API_KEY"
11 | organization: "YOUR_OPENAI_ORGANIZATION (optional)"
12 | input_train_path: "PATH_TO_TRAINING_DATA"
13 | input_test_path: "PATH_TO_VALIDATION_DATA"
14 | max_train: -1
15 | max_val: -1
16 | max_len: -1
17 | epochs: NUMBER_OF_EPOCHS (we used 1)
18 | output_dir: "PATH_TO_OUTPUT_DIR"
19 | model_suffix: "SUFFIX_FOR_MODEL_NAME"
20 | ```
21 |
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/13B/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "add_bos_token": true,
3 | "add_eos_token": false,
4 | "bos_token": {
5 | "__type": "AddedToken",
6 | "content": "",
7 | "lstrip": false,
8 | "normalized": true,
9 | "rstrip": false,
10 | "single_word": false
11 | },
12 | "clean_up_tokenization_spaces": false,
13 | "eos_token": {
14 | "__type": "AddedToken",
15 | "content": "",
16 | "lstrip": false,
17 | "normalized": true,
18 | "rstrip": false,
19 | "single_word": false
20 | },
21 | "legacy": null,
22 | "model_max_length": 1000000000000000019884624838656,
23 | "pad_token": null,
24 | "sp_model_kwargs": {},
25 | "tokenizer_class": "CodeLlamaTokenizer",
26 | "unk_token": {
27 | "__type": "AddedToken",
28 | "content": "",
29 | "lstrip": false,
30 | "normalized": true,
31 | "rstrip": false,
32 | "single_word": false
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/docs/PIE_experiments_readme.md:
--------------------------------------------------------------------------------
1 | ## PIE Experiments
2 |
3 | This directory contains the outputs from our experiments, generally in the following format (a short loading example follows the list):
4 |
5 | - `aggregated_test_results.csv`: contains the final statistics, including those reported in the paper as well as additional statistics such as %Opt at higher speedup thresholds (1.5x, 2.0x, and so on).
6 | - `test_results.jsonl`: contains outputs and additional metrics after benchmarking with `gem5`.
7 | - `melted_test_results.jsonl`: an intermediate file containing `test_results.jsonl` melted into a long format for easier analysis.
8 | - `additional_test_results.jsonl`: an intermediate file which we did not use; it may or may not contain additional `gem5` benchmarking information.
9 | - `raw_test_results.jsonl`: an intermediate version of `test_results.jsonl` without benchmarking info. It usually exists, but in some cases it won't. `test_results.jsonl` and `melted_test_results.jsonl` are derived from this file and contain all of its information plus more.
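
To explore these files programmatically, here is a minimal sketch assuming `pandas` is installed and the files sit in the current directory (adjust paths to your experiment folder):

```python
import pandas as pd

# Final aggregate statistics (the numbers reported in the paper plus extras).
aggregated = pd.read_csv("aggregated_test_results.csv")
print(aggregated.head())

# Per-problem outputs and gem5 metrics; one JSON object per line.
test_results = pd.read_json("test_results.jsonl", lines=True)

# Long-format version of the same results, convenient for groupby-style analysis.
melted = pd.read_json("melted_test_results.jsonl", lines=True)
print(melted.columns.tolist())
```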
--------------------------------------------------------------------------------
/finetuning/train.sh:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR=${OUTPUT_DIR:-"saved_models/code_opt"}
2 | BASE_MODEL=${BASE_MODEL:-"codellama/CodeLlama-7b-hf"}
3 |
4 | torchrun --nproc_per_node=8 \
5 | --master_port=1234 finetune.py \
6 | --base_model $BASE_MODEL \
7 | --data_path ./data/ \
8 | --output_dir $OUTPUT_DIR \
9 | --batch_size 32 \
10 | --micro_batch_size 2 \
11 | --num_epochs 1 \
12 | --learning_rate 1e-5 \
13 | --cutoff_len 2000 \
14 | --train_on_inputs False \
15 | --prompt_template_name "code_opt" \
16 | --use_flash_attention True \
17 | --train_name "train.jsonl" \
18 | --val_name "val.jsonl" \
19 | --test_name "test.jsonl" \
20 | --wandb_project "code_opt" \
21 |
22 | # Copy tokenizer files to appropriate location, modify this if model is different
23 | if [[ $BASE_MODEL == *"7b"* ]]; then
24 | cp -r ./tokenizer_files/7B/* $OUTPUT_DIR
25 | elif [[ $BASE_MODEL == *"13b"* ]]; then
26 | cp -r ./tokenizer_files/13B/* $OUTPUT_DIR
27 | else
28 | echo "Base model size not recognized. Tokenizer files not copied."
29 | fi
30 |
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "add_bos_token": true,
3 | "add_eos_token": false,
4 | "additional_special_tokens": [
5 | "▁",
6 | "▁",
7 | "▁",
8 | "▁"
9 | ],
10 | "bos_token": {
11 | "__type": "AddedToken",
12 | "content": "",
13 | "lstrip": false,
14 | "normalized": true,
15 | "rstrip": false,
16 | "single_word": false
17 | },
18 | "clean_up_tokenization_spaces": false,
19 | "eos_token": {
20 | "__type": "AddedToken",
21 | "content": "",
22 | "lstrip": false,
23 | "normalized": true,
24 | "rstrip": false,
25 | "single_word": false
26 | },
27 | "eot_token": "▁",
28 | "fill_token": "",
29 | "legacy": null,
30 | "middle_token": "▁",
31 | "model_max_length": 1000000000000000019884624838656,
32 | "pad_token": null,
33 | "prefix_token": "▁",
34 | "sp_model_kwargs": {},
35 | "suffix_first": false,
36 | "suffix_token": "▁",
37 | "tokenizer_class": "CodeLlamaTokenizer",
38 | "unk_token": {
39 | "__type": "AddedToken",
40 | "content": "",
41 | "lstrip": false,
42 | "normalized": true,
43 | "rstrip": false,
44 | "single_word": false
45 | },
46 | "use_default_system_prompt": false
47 | }
48 |
--------------------------------------------------------------------------------
/data_augmentation/data_augmentation_driver_final.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PATH_TO_PIE=""
4 | WORKING_DIR=""
5 | RESULTS_DIR=""
6 | TIMEOUT=5
7 | MAX_TOKENS=16000
8 | MAX_PROMPT_LENGTH=10000
9 | MODEL="gpt-3.5-turbo-16k-0613"
10 | GENERATION_STRATEGY="code_only"
11 |
12 |
13 | ## if constant factor, expected 10K programs for $15-$20
14 | PARAM_SETS=(
15 | "1.0 0.9 5 2000" # Temperature=1.0, Top-p=0.9, Num_Samples=5, Total_Iterations=2000; 10K generations
16 | )
17 |
18 | # Loop through each set of parameters and run the Python script
19 | for PARAM_SET in "${PARAM_SETS[@]}"; do
20 | # Split the parameter set into individual variables
21 | read -r TEMPERATURE TOP_P NUM_SAMPLES TOTAL_ITERATIONS <<< "$PARAM_SET"
22 |
23 | echo "Running with Temperature=$TEMPERATURE, Top-p=$TOP_P, Num_Samples=$NUM_SAMPLES, Total_Iterations=$TOTAL_ITERATIONS"
24 |
25 | python3 $PATH_TO_PIE/src/data_augmentation/data_augmentation.py \
26 | --working_dir $WORKING_DIR \
27 | --results_dir_root $RESULTS_DIR \
28 | --timeout $TIMEOUT \
29 | --temperature $TEMPERATURE \
30 | --top_p $TOP_P \
31 | --max_tokens $MAX_TOKENS \
32 | --max_prompt_length $MAX_PROMPT_LENGTH \
33 | --model $MODEL \
34 | --generation_strategy $GENERATION_STRATEGY \
35 | --num_samples $NUM_SAMPLES \
36 | --total_iterations $TOTAL_ITERATIONS 2>&1 | tee $RESULTS_DIR/temperature_${TEMPERATURE}_top_p_${TOP_P}_num_samples_${NUM_SAMPLES}_total_iterations_${TOTAL_ITERATIONS}.log
37 | done
38 |
--------------------------------------------------------------------------------
/data_augmentation/data_augmentation_driver_search.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PATH_TO_PIE=""
4 | WORKING_DIR=""
5 | RESULTS_DIR=""
6 | TIMEOUT=5
7 | MAX_TOKENS=16000
8 | MAX_PROMPT_LENGTH=10000
9 | MODEL="gpt-3.5-turbo-16k-0613"
10 | GENERATION_STRATEGY="code_only"
11 |
12 | PARAM_SETS=(
13 | "1.2 0.8 5 20"
14 | "1 0.8 5 20"
15 | "1 0.9 5 20"
16 | "0.8 1 5 20"
17 | "0.5 1 5 20"
18 | "1 0.8 1 100"
19 | "1 0.9 1 100"
20 | "1 1 1 100"
21 | "0.7 1 1 100"
22 | "0.5 1 1 100"
23 | )
24 |
25 |
26 | # Loop through each set of parameters and run the Python script
27 | for PARAM_SET in "${PARAM_SETS[@]}"; do
28 | # Split the parameter set into individual variables
29 | read -r TEMPERATURE TOP_P NUM_SAMPLES TOTAL_ITERATIONS <<< "$PARAM_SET"
30 |
31 | echo "Running with Temperature=$TEMPERATURE, Top-p=$TOP_P, Num_Samples=$NUM_SAMPLES, Total_Iterations=$TOTAL_ITERATIONS"
32 |
33 | python3 $PATH_TO_PIE/src/data_augmentation/data_augmentation.py \
34 | --working_dir $WORKING_DIR \
35 | --results_dir_root $RESULTS_DIR \
36 | --timeout $TIMEOUT \
37 | --temperature $TEMPERATURE \
38 | --top_p $TOP_P \
39 | --max_tokens $MAX_TOKENS \
40 | --max_prompt_length $MAX_PROMPT_LENGTH \
41 | --model $MODEL \
42 | --generation_strategy $GENERATION_STRATEGY \
43 | --num_samples $NUM_SAMPLES \
44 | --total_iterations $TOTAL_ITERATIONS 2>&1 | tee $RESULTS_DIR/temperature_${TEMPERATURE}_top_p_${TOP_P}_num_samples_${NUM_SAMPLES}_total_iterations_${TOTAL_ITERATIONS}.log
45 | done
46 |
--------------------------------------------------------------------------------
/finetuning/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 |
163 | wandb
164 | wandb/*
165 | saved_models/*
166 | saved_models/
--------------------------------------------------------------------------------
/finetuning/utils/prompter.py:
--------------------------------------------------------------------------------
1 | """
2 | A dedicated helper to manage templates and prompt building.
3 |
4 | Code adapted from the alpaca-lora repository at https://github.com/tloen/alpaca-lora/blob/main/utils/prompter.py
5 | """
6 |
7 | import json
8 | import os.path as osp
9 | from typing import Union
10 |
11 |
12 | class Prompter(object):
13 |
14 | __slots__ = ("template", "_verbose", "pctile_test")
15 |
16 | def __init__(self, template_name: str = "", verbose: bool = False):
17 | self._verbose = verbose
18 | self.pctile_test = False
19 | if template_name == "code_opt_w_speedup_pctile_test":
20 | self.pctile_test = True
21 | template_name = "code_opt_w_speedup_pctile"
22 | if not template_name:
23 | # Enforce the default here, so the constructor can be called with '' and will not break.
24 | template_name = "code_opt"
25 | file_name = osp.join("templates", f"{template_name}.json")
26 | if not osp.exists(file_name):
27 | raise ValueError(f"Can't read {file_name}")
28 | with open(file_name) as fp:
29 | self.template = json.load(fp)
30 | if self._verbose:
31 | print(
32 | f"Using prompt template {template_name}: {self.template['description']}"
33 | )
34 |
35 | print(f"template_name: {template_name}")
36 | print(f"pcitle_test: {self.pctile_test}")
37 |
38 | def generate_prompt(
39 | self,
40 | src_code: str,
41 | tgt_code: Union[None, str] = None,
42 | speedup_desc: Union[None, str] = None,
43 | speedup_bin: Union[None, str] = None,
44 | pctile: Union[None, str] = None,
45 | code_cutoff: int = 1500,
46 | ) -> str:
47 | # returns the full prompt from src_code and optional input
48 | # if a tgt_code (=response, =output) is provided, it's also appended.
49 |
50 | # truncate src_code (and tgt_code below) to code_cutoff chars (default 1500) so the prompt is not too long
51 | src_code = src_code[:code_cutoff]
52 |
53 | if speedup_desc and speedup_bin:
54 | raise ValueError("Both speedup_desc and speedup_bin can mot be set.")
55 |
56 | if tgt_code:
57 | tgt_code = tgt_code[:code_cutoff]
58 |
59 | if speedup_desc:
60 | try:
61 | res = self.template["prompt_no_input"].format(
62 | src_code=src_code,
63 | speedup_desc=speedup_desc
64 | )
65 | except Exception as e:
66 | print("Oops! There is no speedup_desc in the template prompt!")
67 | elif speedup_bin:
68 | try:
69 | res = self.template["prompt_no_input"].format(
70 | src_code=src_code,
71 | speedup_bin=speedup_bin
72 | )
73 | except Exception as e:
74 | print("Oops! There is no speedup_bin in the template prompt!")
75 | elif pctile:
76 | try:
77 | res = self.template["prompt_no_input"].format(
78 | src_code=src_code,
79 | pctile=pctile
80 | )
81 | except Exception as e:
82 | print("Oops! There is no pctile in the template prompt!")
83 | elif self.pctile_test: # test time
84 | try:
85 | res = self.template["prompt_no_input"].format(
86 | src_code=src_code,
87 | pctile="10"
88 | )
89 | except Exception as e:
90 | print("Oops! There is no pctile in the template prompt!")
91 | else: # only src_code
92 | try:
93 | res = self.template["prompt_no_input"].format(
94 | src_code=src_code
95 | )
96 | except Exception as e:
97 | print("Oops! There is no src_code in the template prompt!")
98 |
99 | if tgt_code:
100 | res = f"{res}{tgt_code}"
101 |
102 | if self._verbose:
103 | print(res)
104 | return res
105 |
106 | def get_response(self, output: str) -> str:
107 | return output.split(self.template["response_split"])[1].strip()
108 |
--------------------------------------------------------------------------------
/finetuning/sample.py:
--------------------------------------------------------------------------------
1 | """
2 | Code used for sampling programs based on the text-generation-inference API at https://github.com/huggingface/text-generation-inference
3 |
4 | """
5 |
6 |
7 | from text_generation import Client
8 | import pandas as pd
9 | from utils.prompter import Prompter
10 | from tqdm import tqdm
11 | import fire
12 | import re
13 |
14 | import concurrent.futures
15 |
16 | def extract_first_program(text):
17 | # Look for the main function's start, considering possible non-standard code
18 | main_start = re.search(r"\b(?:int\s+)?main\b", text)
19 |
20 | if not main_start:
21 | return text # Return original if main is not found
22 |
23 | open_braces = 0
24 | closing_brace_position = -1
25 | main_function_started = False
26 |
27 | # Start looking for opening brace after the detected main function
28 | i = main_start.end()
29 |
30 | while i < len(text):
31 | if text[i] == "{":
32 | open_braces += 1
33 | if not main_function_started:
34 | main_function_started = True
35 |
36 | elif text[i] == "}":
37 | open_braces -= 1
38 | if open_braces == 0 and main_function_started:
39 | closing_brace_position = i
40 | break
41 |
42 | i += 1
43 |
44 | # If we found a closing brace for the first program
45 | if closing_brace_position != -1:
46 | return text[: closing_brace_position + 1]
47 | else:
48 | return text # Return original text if a matching closing brace wasn't found
49 |
50 |
51 | def postprocess(text, prompt_name):
52 |
53 | if prompt_name == 'code_opt':
54 | return extract_first_program(text)
55 | else:
56 | return text
57 |
58 |
59 | def main(
60 | test_file=None,
61 | output_file=None,
62 | do_sample=None,
63 | num_samples=8,
64 | max_new_tokens=1000,
65 | temperature=0.7,
66 | num_threads=20, # number of threads to use for parallel processing
67 | prompt_name="code_opt",
68 | ):
69 | # print do_sample
70 | print(f"do_sample: {do_sample}")
71 | # print type of do_sample
72 | print(f"type of do_sample: {type(do_sample)}")
73 |
74 | client = Client("http://127.0.0.1:8080", timeout=100)
75 |
76 | prompter = Prompter(template_name=prompt_name)
77 |
78 | print(f"prompt_name: {prompt_name}")
79 |
80 | test_df = pd.read_json(test_file, lines=True, orient="records")
81 |
82 | # create results dataframe with src_code column
83 | results_df = pd.DataFrame(columns=["src_code"])
84 | results_df["src_code"] = test_df["src_code"]
85 | # create empty column for completions
86 | results_df["generated_answers"] = results_df.apply(lambda x: [], axis=1)
87 |
88 | def process_request(index, src_code):
89 | all_completions = []
90 |
91 | prompt = prompter.generate_prompt(src_code=src_code)
92 |
93 | if do_sample:
94 | completions = client.generate(
95 | prompt,
96 | max_new_tokens=max_new_tokens,
97 | do_sample=True,
98 | temperature=temperature,
99 | best_of=num_samples,
100 | )
101 | else:
102 | completions = client.generate(
103 | prompt,
104 | max_new_tokens=max_new_tokens,
105 | do_sample=False,
106 | # best_of=num_samples,
107 | )
108 |
109 | # get all additional completions; best_of_sequences is None when best_of is not used (greedy path)
110 | best_of_sequences = [
111 | seq.generated_text
112 | for seq in (completions.details.best_of_sequences or [])
113 | ]
114 |
115 | all_programs = [postprocess(completions.generated_text, prompt_name=prompt_name)] + [
116 | postprocess(best_of_sequences[i], prompt_name=prompt_name)
117 | for i in range(len(best_of_sequences))
118 | ]
119 |
120 | return index, all_programs
121 |
122 | # Use ThreadPoolExecutor to process in parallel
123 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
124 | future_to_index = {executor.submit(process_request, i, row["src_code"]): i for i, row in test_df.iterrows()}
125 | for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(test_df)):
126 | index, all_programs = future.result()
127 | results_df.at[index, "generated_answers"] = all_programs
128 |
129 | # add generated_answers column to test_df
130 | test_df["generated_answers"] = results_df["generated_answers"]
131 |
132 | # save test_df to output_file
133 | test_df.to_json(output_file, orient="records", lines=True)
134 |
135 |
136 | if __name__ == "__main__":
137 | fire.Fire(main)
138 |
--------------------------------------------------------------------------------
/finetuning/README.md:
--------------------------------------------------------------------------------
1 | # Finetuning Scripts for codellama
2 |
3 | ## Overview
4 |
5 | This subdirectory contains the scripts used to finetune codellama models for PIE. ``train.sh`` contains an example bash script for finetuning codellama-7b with the default prompt. ``sample.py`` and ``server.sh`` are used for sampling with the prompt templates and the [text-generation-inference](https://github.com/huggingface/text-generation-inference) API.
6 |
7 | ## Docker Setup for Finetuning
8 |
9 | To use the provided Docker image for finetuning, you need to install Docker and mount the directory properly. Follow these steps:
10 |
11 | 1. Install Docker: Follow the instructions on the [official Docker website](https://docs.docker.com/get-docker/) to install Docker on your system.
12 |
13 | 2. Mount the directory for the data: When running the Docker container, use the `-v` option to mount the directory containing your data. For example:
14 | ```bash
15 | docker run -v /path/to/your/data:/workspace/data yimengzeng/pie:torch201
16 | ```
17 |
18 | ## Finetuning
19 |
20 | We provide a Docker image at `yimengzeng/pie:torch201` which contains all of the dependencies for finetuning the model; you can also refer to `docker/Dockerfile` for the specific packages required to replicate the environment.
21 |
22 | To finetune codellama with the entire PIE dataset and the non-performance-conditioned prompt, run
23 |
24 | ```bash
25 | bash train.sh
26 | ```
27 |
28 | inside the Docker container.
29 |
30 | To finetune codellama with the performance-conditioned prompt, change the `--prompt_template_name` flag to `"code_opt_w_speedup_pctile"`.
31 |
32 | To use different training files, modify the `--train_name`, `--val_name`, and `--test_name` flags in `train.sh` with the paths to your training, validation, and test files, respectively, and mount the directory containing the files when running the Docker container.
33 |
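As a minimal sketch, the relevant flags inside `train.sh` might look like this after such an edit (the data file names and output directory here are placeholders, and the remaining hyperparameters from `train.sh` are omitted for brevity):

```bash
torchrun --nproc_per_node=8 --master_port=1234 finetune.py \
    --base_model "codellama/CodeLlama-7b-hf" \
    --data_path ./data/ \
    --output_dir saved_models/code_opt_perf_cond \
    --prompt_template_name "code_opt_w_speedup_pctile" \
    --train_name "my_train.jsonl" \
    --val_name "my_val.jsonl" \
    --test_name "my_test.jsonl"
```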
34 |
35 | ## Sampling
36 |
37 | To generate prompts for the models, please follow the details in the paper. Additional prompt templates are located in `templates/`, and the `utils/prompter.py` module constructs prompts from those templates; a minimal usage sketch follows.
38 |
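The sketch below shows one way to build an optimization prompt with `Prompter` (the source program here is a placeholder; run it from the `finetuning/` directory so the relative `templates/` folder is found):

```python
from utils.prompter import Prompter

# Load the default (non-performance-conditioned) template.
prompter = Prompter(template_name="code_opt")

slow_code = "int main() { /* placeholder: program to optimize */ }"
prompt = prompter.generate_prompt(src_code=slow_code)
print(prompt)
```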
39 | To sample optimized programs using the finetuned model with the text-generation-inference tool, first replace the PATH_TO_MODEL field in server.sh with the actual path of the finetuned model, and then serve the model by running
40 |
41 | ```bash
42 | bash server.sh
43 | ```
44 | To sample from the model just served with default parameters as in the paper, run
45 |
46 | ```bash
47 | bash sample.sh
48 | ```
49 | Note that sampling does not require you to spin up the container on your own. You can modify the following parameters in `server.sh` and `sample.sh`; a sketch of the underlying `sample.py` invocation follows the lists:
50 |
51 | For `server.sh`:
52 | - `model`: Set this to the path of your finetuned model, e.g., `'codellama/CodeLlama-7b-hf'`.
53 | - `volume`: Set this to the path where your model is stored, e.g., `$PWD/saved_models/`.
54 | - `max_best_of`: Set this to the maximum number of samples to generate in parallel, e.g., `20`.
55 |
56 | For `sample.sh`:
57 | - `--test_file`: Set this to the path of your test file.
58 | - `--output_file`: Set this to the path where you want to save the results.
59 | - `--num_samples`: Set this to the number of samples you want to generate.
60 | - `--num_threads`: Set this to the number of threads you want to use for sampling.
61 | - `--prompt_name`: Set this to the name of the prompt template you want to use.
62 | - `--temperature`: Set this to the temperature parameter for sampling.
63 |
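For reference, here is a sketch of the `sample.py` call that `sample.sh` wraps, with the flags above spelled out (the paths are placeholders and the values mirror `sample.py`'s defaults):

```bash
python sample.py \
    --test_file /path/to/test.jsonl \
    --output_file /path/to/samples.jsonl \
    --do_sample True \
    --num_samples 8 \
    --max_new_tokens 1000 \
    --temperature 0.7 \
    --num_threads 20 \
    --prompt_name "code_opt"
```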
64 |
65 | ## Models
66 |
67 | Here are links to the finetuned models used in the paper and the corresponding pre-trained models used for finetuning:
68 |
69 | | Experiment | Model | Type | Pretrained Link | Finetuned Link |
70 | |------------|-------|------|-----------------|----------------|
71 | | All | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-all-uncon-7b](https://huggingface.co/LearningOpt/pie-all-uncon-7b) |
72 | | All | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-all-uncon-13b](https://huggingface.co/LearningOpt/pie-all-uncon-13b) |
73 | | HQ | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-hq-selfplay-7b](https://huggingface.co/LearningOpt/pie-hq-selfplay-7b) |
74 | | HQ | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-hq-selfplay-13b](https://huggingface.co/LearningOpt/pie-hq-selfplay-13b) |
75 | | All w/Perf-Cond | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-conditioned-7b](https://huggingface.co/LearningOpt/pie-conditioned-7b) |
76 | | All w/Perf-Cond | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-conditioned-13b](https://huggingface.co/LearningOpt/pie-conditioned-13b) |
77 | | HQ + Self-Play | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-hq-selfplay-7b](https://huggingface.co/LearningOpt/pie-hq-selfplay-7b) |
78 | | HQ + Self-Play | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-hq-selfplay-13b](https://huggingface.co/LearningOpt/pie-hq-selfplay-13b) |
79 |
--------------------------------------------------------------------------------
/scripts/sample_and_eval.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 | import logging
4 | import sys
5 | import yaml
6 | import shutil
7 | import os
8 |
9 | def start_generation_container(model, volume, max_best_of, port=4242, startup_timeout=600):
10 | # command = f"docker run --detach --gpus all --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
11 | # with 1,2,3,4,5,6,7 gpus
12 | if not model.startswith("codellama"):
13 | model = f"data/{model}"
14 | # restrict the container to specific GPUs; adjust the device list (or use --gpus all) for your setup
15 | command = f"docker run --detach --gpus '\"device=1,2,3,4,5,6,7\"' --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
16 | # use the following line for podman or potentially for a different docker installation, the nvidia-docker command may vary
17 | # command = f"docker run --detach -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
18 | container_id = subprocess.check_output(command, shell=True).decode().strip()
19 | # wait until the logs say Connected
20 | while True:
21 | logging.info(f"Waiting for container to start with id {container_id} and timeout {startup_timeout} left")
22 | logs = subprocess.check_output(f"docker logs {container_id}", shell=True).decode()
23 | if "Connected" in logs:
24 | break
25 | time.sleep(5)
26 | startup_timeout -= 5
27 | if startup_timeout <= 0:
28 | raise TimeoutError("Timeout waiting for container to start")
29 | return container_id
30 |
31 | def stop_generation_container(container_id):
32 | subprocess.run(f"docker stop {container_id}", shell=True)
33 |
34 | def remove_generation_container(container_id):
35 | subprocess.run(f"docker rm {container_id}", shell=True)
36 |
37 |
38 | def sample_from_container(test_file, output_file, do_sample, num_samples=8, max_new_tokens=1000, temperature=0.7, num_threads=20, prompt_name="code_opt"):
39 | logging.info(f"Sampling from container with test_file {test_file} and output_file {output_file}")
40 | command = f"python finetuning/sample.py --test_file {test_file} --output_file {output_file} --do_sample {do_sample} --num_samples {num_samples} --max_new_tokens {max_new_tokens} --temperature {temperature} --num_threads {num_threads} --prompt_name {prompt_name}"
41 | logging.info(f"Running command {command}")
42 | p = subprocess.run(command, shell=True)
43 | logging.info(f"sample.py returned with code {p.returncode}")
44 | return p.returncode
45 |
46 | def run_eval(eval_args, model_generated_outputs_path):
47 | eval_args["model_generated_outputs_path"] = model_generated_outputs_path
48 | eval_output_dir = eval_args["output_dir"]
49 | if not os.path.exists(eval_output_dir):
50 | os.makedirs(eval_output_dir)
51 | else:
52 | logging.warning(f"Output directory {eval_output_dir} already exists, overwriting")
53 | with open(os.path.join(eval_output_dir, "config.yaml"), "w") as f:
54 | yaml.dump(eval_args, f)
55 | logging.info(f"Running eval with args {eval_args}")
56 | cmd = f"python gem5/gem5_eval.py --config_path {os.path.join(eval_output_dir, 'config.yaml')}"
57 | logging.info(f"Running command {cmd}")
58 | p = subprocess.run(cmd, shell=True)
59 | logging.info(f"gem5_eval.py returned with code {p.returncode}")
60 | logging.info("Done")
61 |
62 |
63 | def main():
64 | cfg_path = sys.argv[1]
65 | with open(cfg_path, 'r') as f:
66 | cfg = yaml.safe_load(f)
67 | text_gen_args = cfg["text_gen_args"]
68 | sampling_args = cfg["sampling_args"]
69 | eval_args = cfg["eval_args"]
70 |
71 | # Check if the output directory for evaluation exists
72 | if os.path.exists(eval_args['output_dir']):
73 | logging.info(f"Output directory {eval_args['output_dir']} already exists. Skipping the entire script.")
74 | return
75 |
76 | # Check if the output file from sampling exists
77 | if os.path.exists(sampling_args['output_file']):
78 | logging.info(f"Output file {sampling_args['output_file']} from sampling already exists. Skipping container startup and sampling.")
79 | else:
80 | # Start the container and perform sampling
81 | logging.info(f"Starting generation container with args {text_gen_args}")
82 | container_id = start_generation_container(text_gen_args["generation_model_name"], text_gen_args["volume_mount"], text_gen_args["max_best_of"], port=text_gen_args["port"])
83 | logging.info(f"Sampling from container with args {sampling_args}")
84 | sample_from_container(**sampling_args)
85 | # Stop and remove the container
86 | logging.info(f"Stopping container with id {container_id}")
87 | stop_generation_container(container_id)
88 | logging.info(f"Removing container with id {container_id}")
89 | remove_generation_container(container_id)
90 | logging.info("Successfully removed container")
91 |
92 | # Run evaluation
93 | logging.info(f"Setting model_generated_outputs_path to {sampling_args['output_file']} and running eval with args {eval_args}")
94 | run_eval(eval_args, sampling_args['output_file'])
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/gem5/README.md:
--------------------------------------------------------------------------------
1 | # Gem5 Simulator for PIE
2 |
3 | ## Overview
4 |
5 | This subdirectory contains the `gem5` module, which we use to interface with the `gem5` simulator. The `gem5` simulator is a full-system and CPU simulator that can be used to simulate the execution of a program on a computer system. We use `gem5` to simulate the execution of programs in a deterministic and reproducible manner.
6 |
7 | For our experiments, we use a simulated Intel Skylake CPU.
8 | We provide an easy-to-use docker image and API that can be used to reproduce our results and for other researchers to continue to use for program optimization research.
9 |
10 | Building the environment is similar to the [gym](https://github.com/Farama-Foundation/Gymnasium) API for reinforcement learning. After importing the module and calling `make`, the docker image is automatically pulled on first use and a container is created. The environment object then provides a convenient abstraction for interacting with the simulator.
11 |
12 | Results from our experiments can be located in [this google drive folder](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing).
13 |
14 |
15 |
16 | ## Usage
17 | \***********************************************************************************************************************************
18 |
**Note that in order to use the module and its container for simulation, your host architecture needs to be x86-64 (AMD64).**
20 |
21 | \***********************************************************************************************************************************
22 |
23 | First, you need to add the PIE project to your Python path. You can do this by running the following command from the root of the project:
24 |
25 | ```bash
26 | export PYTHONPATH=$PYTHONPATH:$(pwd)
27 | ```
28 |
29 | You will need Docker installed on your system. The module uses the Docker Python SDK and is designed to abstract away the hassle of pulling the container and configuring the gem5 simulator. We have designed it to mirror the OpenAI Gym API, so it should be easy to use for anyone familiar with that.
30 |
31 | ```python
32 |
33 | from gem5 import simulator
34 | env = simulator.make(...)
35 | results = env.submit_multiple_single_submissions(...)
36 |
37 | ```
38 |
39 | To get started, use `simulator.make()` to create an environment object, which you can then use to submit programs to the simulator backend; a short sketch combining the arguments below follows the two lists.
40 |
41 | #### Key Arguments for simulator.make()
42 |
43 | - `arch`: The architecture to use. Currently only 'X86-skylake' is supported.
44 | - `cpuset_cpus`: The cpus to use. If not specified, all cpus are used.
45 | - `workers`: The number of workers to use. If not specified, all cpus are used.
46 | - `gem5_acc_threshold`: If the functional accuracy is below this threshold, we skip any benchmarking and return the result early.
47 | - `port`: The port to use for communication.
48 | - `optimization_flag`: The GCC optimization flag to use for compilation, for our work we used '-O3'.
49 | - `cpu_type`: The type of CPU configuration to use. For our work we used 'Verbatim' from the skylake configuration used.
50 | - `timeout_seconds_gem5`: The timeout in seconds for the gem5 simulator, for our work we used 120 seconds for evaluation.
51 | - `verbose`: We highly recommend setting this to True to monitor the progress of the gem5 simulator.
52 | - `exit_early_on_fail`: If True, we exit early when any individual test case times out or hits a runtime error. We highly recommend setting this to True to speed things up if you are only evaluating, since a failing program cannot contribute any speedup.
53 |
54 | #### Key Arguments for env.submit_multiple_single_submissions()
55 |
56 | - `code_list`: A list of strings, each string is the code of a single submission.
57 | - `testcases_list`: Each sublist consists of the test cases used for benchmarking the corresponding code: these are the integer indices of the test cases in the test case pool.
58 | - `problem_id_list`: A list of strings, each string is the problem id for the corresponding code.
59 | - `timing_env`: The timing environment to use. Currently only 'gem5' is supported; we have prototype support for hardware-based benchmarking on your machine via 'hyperfine' (or 'both'), but the 'hyperfine' support is not fully implemented yet.
60 |
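Putting the two calls together, here is a minimal sketch; the keyword arguments follow the lists above, but the specific values, programs, problem ids, and test-case indices are illustrative placeholders:

```python
from gem5 import simulator

# Create the environment; the docker image is pulled automatically on first use.
env = simulator.make(
    arch="X86-skylake",
    optimization_flag="-O3",
    cpu_type="Verbatim",
    timeout_seconds_gem5=120,
    verbose=True,
    exit_early_on_fail=True,
)

# Benchmark two candidate programs against the first five test cases of one problem.
results = env.submit_multiple_single_submissions(
    code_list=["/* candidate program A */", "/* candidate program B */"],
    testcases_list=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
    problem_id_list=["p00001", "p00001"],
    timing_env="gem5",
)
```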
61 | ## Evaluation Script
62 |
63 | The evaluation driver is located in `gem5/gem5_eval.py`. This script requires a yaml configuration file to be passed in as an argument to `--config_path`. Example usage from the project directory would be:
64 |
65 | ```bash
66 | export PYTHONPATH=$PYTHONPATH:$(pwd)
67 | python gem5/gem5_eval.py --config_path PATH_TO_EXPERIMENT_CONFIG.yaml
68 | ```
69 |
70 | The yaml configuration file should contain at least the following fields:
71 |
72 | - `model_generated_outputs_path`: The path to the model generated outputs. This should be a `.jsonl` file containing the model generated outputs in addition to all other metadata in the test set file.
73 | - `output_dir`: The directory to output the results to.
74 | - `reference_file_path`: The path to the reference file. This should be the reference `.jsonl` file containing the reference outputs in addition to all other metadata in the test set file.
75 | - `model_generated_potentially_faster_code_col`: The column in the model generated outputs that contains the model's generations of potentially faster code. We've used "generated_answers" as a default.
76 |
77 | An example is provided in [gem5/template_config.yaml](template_config.yaml).
78 |
--------------------------------------------------------------------------------
/openai_finetuning/pie_chatgpt.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import openai
3 | import random, time  # time is needed for the retry backoff sleep below
4 | import tiktoken
5 | from tqdm import tqdm
6 | from typing import List
7 | from concurrent.futures import ThreadPoolExecutor
8 |
9 | random.seed(42)
10 |
11 |
12 | def retry_with_exponential_backoff(
13 | func,
14 | initial_delay: float = 1,
15 | exponential_base: float = 2,
16 | jitter: bool = True,
17 | max_retries: int = 10,
18 | errors: tuple = (
19 | openai.error.RateLimitError,
20 | openai.error.ServiceUnavailableError,
21 | ),
22 | ):
23 | """Retry a function with exponential backoff."""
24 |
25 | def wrapper(*args, **kwargs):
26 | # Initialize variables
27 | num_retries = 0
28 | delay = initial_delay
29 |
30 | # Loop until a successful response or max_retries is hit or an exception is raised
31 | while True:
32 | try:
33 | return func(*args, **kwargs)
34 |
35 | # Retry on specified errors
36 | except errors as e:
37 | # Increment retries
38 | num_retries += 1
39 |
40 | # Check if max retries has been reached
41 | if num_retries > max_retries:
42 | raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
43 |
44 | # Increment the delay
45 | delay *= exponential_base * (1 + jitter * random.random())
46 |
47 | # Sleep for the delay
48 | time.sleep(delay)
49 | print(f"\nRetrying after {delay:.2f} seconds.")
50 |
51 | # Raise exceptions for any errors not specified
52 | except Exception as e:
53 | raise e
54 |
55 | return wrapper
56 |
57 |
58 | class ChatGPTWrapper:
59 | """A Wrapper for ChatGPT model interaction."""
60 |
61 | @staticmethod
62 | def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
63 | """
64 | Calculate the number of tokens in a text string.
65 |
66 | Args:
67 | - string (str): The text string to be tokenized.
68 | - encoding_name (str, optional): The encoding name for tokenization. Defaults to "cl100k_base".
69 | Returns:
70 | - int: Number of tokens in the string.
71 | """
72 | encoding = tiktoken.get_encoding(encoding_name)
73 | num_tokens = len(encoding.encode(string))
74 | return num_tokens
75 |
76 | @staticmethod
77 | @retry_with_exponential_backoff
78 | def call_openai_api(
79 | slow_code_str: str, max_tokens: int = 1024, temperature: float = 0.0
80 | ) -> str:
81 | """
82 | Calls the OpenAI API to optimize a given code.
83 |
84 | Args:
85 | - slow_code_str (str): The code string that needs to be optimized.
86 |
87 | - max_tokens (int, optional): The maximum number of tokens to be used for generation. Defaults to 1024.
88 |
89 | - temperature (float, optional): The temperature value for generation. Defaults to 0.0.
90 |
91 | Returns:
92 | - str: Optimized code returned by the OpenAI API.
93 | """
94 | # Initialize the chat log with system and user inputs
95 | start_chat_log = [
96 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
97 | {"role": "user", "content": ChatGPTWrapper.prepare_input(slow_code_str)},
98 | ]
99 | # Call the OpenAI API with the given chat log
100 | response = openai.ChatCompletion.create(
101 | model="gpt-3.5-turbo-0613",
102 | messages=start_chat_log,
103 | max_tokens=max_tokens,
104 | temperature=temperature,
105 | )
106 | # Extract the optimized code from the response
107 | return response["choices"][0]["message"]["content"]
108 |
109 | @staticmethod
110 | def prepare_input(slow_code_str: str) -> str:
111 | """
112 | Prepares the input for the OpenAI API by framing the code to be optimized.
113 |
114 | Args:
115 | - slow_code_str (str): The code string that needs to be framed for optimization.
116 |
117 | Returns:
118 | - str: Formatted input for the OpenAI API.
119 | """
120 | prompt = f"""// slower version::
121 |
122 | {slow_code_str}
123 |
124 | // optimized version of the same code:
125 |
126 | """
127 | return prompt
128 |
129 |
130 | QUESTION_PREFIX = "# slower version:\n\n"
131 | ANSWER_PREFIX = "# optimized version of the same code:\n\n"
132 |
133 |
134 |
135 | def main(input_file: str, output_file: str):
136 | # Read the jsonl file using pandas
137 | df = pd.read_json(input_file, lines=True)
138 |
139 | # Ensure src_code is in the dataframe
140 | if 'src_code' not in df.columns:
141 | raise ValueError("'src_code' column not found in the input file.")
142 |
143 | # Optimize code using multiple threads
144 | df['optimized_code'] = optimize_code_parallel(df['src_code'].tolist())
145 |
146 | # Save the dataframe to a new jsonl file
147 | df.to_json(output_file, orient='records', lines=True)
148 |
149 |
150 | def optimize_code_parallel(code_list: List[str], max_workers: int = 5) -> List[str]:
151 | """
152 | Function to optimize code using multiple threads.
153 |
154 | Args:
155 | - code_list (List[str]): List of code strings to optimize.
156 | - max_workers (int): Number of worker threads.
157 |
158 | Returns:
159 | - List[str]: List of optimized code strings.
160 | """
161 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
162 | optimized_code_list = list(tqdm(executor.map(ChatGPTWrapper.call_openai_api, code_list), total=len(code_list)))
163 | return optimized_code_list
164 |
165 | if __name__ == "__main__":
166 | import sys
167 |
168 | if len(sys.argv) != 3:
169 | print("Usage: python pie_chatgpt.py ")
170 | sys.exit(1)
171 | main(input_file=sys.argv[1], output_file=sys.argv[2])
--------------------------------------------------------------------------------
/finetuning/finetune.py:
--------------------------------------------------------------------------------
1 | """
2 | Finetune CodeLlama models on the PIE dataset.
3 |
4 | Code adapted from the alpaca-lora repository at https://github.com/tloen/alpaca-lora/blob/main/finetune.py
5 | """
6 |
7 | import os
8 | import sys
9 | from typing import List
10 |
11 | import fire
12 | import torch
13 | import transformers
14 | from datasets import load_dataset
15 |
16 | """
17 | Unused imports:
18 | import torch.nn as nn
19 | import bitsandbytes as bnb
20 | """
21 |
22 | from transformers import AutoModelForCausalLM, CodeLlamaTokenizer
23 |
24 | from utils.prompter import Prompter
25 |
26 |
27 | def train(
28 | # model/data params
29 | base_model: str = "codellama/CodeLlama-13b-hf",
30 | data_path: str = "data/code_data",
31 | output_dir: str = "./code_opt/13b-test",
32 | # training hyperparams
33 | batch_size: int = 128,
34 | micro_batch_size: int = 2,
35 | num_epochs: int = 3,
36 | learning_rate: float = 3e-4,
37 | cutoff_len: int = 1024,
38 | val_set_size: int = 2000,
39 | train_on_inputs: bool = True, # if False, masks out inputs in loss
40 | add_eos_token: bool = True,
41 | group_by_length: bool = False,
42 | # wandb params
43 | wandb_project: str = "code-llama",
44 | wandb_run_name: str = "",
45 | wandb_watch: str = "", # options: false | gradients | all
46 | wandb_log_model: str = "", # options: false | true
47 | resume_from_checkpoint: str = None, # either training checkpoint or final adapter
48 | prompt_template_name: str = "code_opt", # The prompt template to use, will default to code_opt.
49 | use_flash_attention = True,
50 | use_wandb: bool = True, # if True, will use wandb if wandb_project is set
51 | # training data and prompt template
52 | train_name: str = "train.jsonl",
53 | val_name: str = "val.jsonl",
54 | test_name: str = "test.jsonl",
55 | with_speedup_desc: bool = False, # if True, we use templates/code_opt_w_speedup_desc.json
56 | with_speedup_bin: bool = False, # if True, we use templates/code_opt_w_speedup_bin.json
57 | with_pctile: bool = False, # if True, we use templates/code_opt_w_speedup_pctile.json
58 | ):
59 | if with_speedup_desc and with_speedup_bin:
60 | raise ValueError("Both with_speedup_desc and with_speedup_bin can not be TRUE!!!")
61 | if int(os.environ.get("LOCAL_RANK", 0)) == 0:
62 | print(
63 | f"Training code_opt-LoRA model with params:\n"
64 | f"base_model: {base_model}\n"
65 | f"data_path: {data_path}\n"
66 | f"output_dir: {output_dir}\n"
67 | f"batch_size: {batch_size}\n"
68 | f"micro_batch_size: {micro_batch_size}\n"
69 | f"num_epochs: {num_epochs}\n"
70 | f"learning_rate: {learning_rate}\n"
71 | f"cutoff_len: {cutoff_len}\n"
72 | f"val_set_size: {val_set_size}\n"
73 | f"train_on_inputs: {train_on_inputs}\n"
74 | f"add_eos_token: {add_eos_token}\n"
75 | f"group_by_length: {group_by_length}\n"
76 | f"wandb_project: {wandb_project}\n"
77 | f"wandb_run_name: {wandb_run_name}\n"
78 | f"wandb_watch: {wandb_watch}\n"
79 | f"wandb_log_model: {wandb_log_model}\n"
80 | f"resume_from_checkpoint: {resume_from_checkpoint or False}\n"
81 | f"prompt template: {prompt_template_name}\n"
82 | f"Train File: {os.path.join(data_path, train_name)}\n"
83 | f"Val File: {os.path.join(data_path, val_name)}\n"
84 | f"Test File: {os.path.join(data_path, test_name)}\n"
85 | )
86 | assert (
87 | base_model
88 | ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
89 | gradient_accumulation_steps = batch_size // micro_batch_size
90 |
91 | if with_speedup_desc:
92 | prompter = Prompter(template_name="code_opt_w_speedup_desc")
93 | elif with_speedup_bin:
94 | prompter = Prompter(template_name="code_opt_w_speedup_bin")
95 | elif with_pctile:
96 | prompter = Prompter(template_name="code_opt_w_speedup_pctile")
97 | else:
98 | prompter = Prompter(prompt_template_name)
99 |
100 | device_map = "auto"
101 | world_size = int(os.environ.get("WORLD_SIZE", 1))
102 | ddp = world_size != 1
103 | if ddp:
104 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
105 | gradient_accumulation_steps = gradient_accumulation_steps // world_size
106 |
107 | if use_wandb:
108 | # Check if parameter passed or if set within environ
109 | use_wandb = len(wandb_project) > 0 or (
110 | "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
111 | )
112 | # Only overwrite environ if wandb param passed
113 | if len(wandb_project) > 0:
114 | os.environ["WANDB_PROJECT"] = wandb_project
115 | if len(wandb_watch) > 0:
116 | os.environ["WANDB_WATCH"] = wandb_watch
117 | if len(wandb_log_model) > 0:
118 | os.environ["WANDB_LOG_MODEL"] = wandb_log_model
119 |
120 | # make sure to have latest version of transformers library and flash attention installed
121 | model = AutoModelForCausalLM.from_pretrained(
122 | base_model,
123 | torch_dtype=torch.bfloat16,
124 | device_map=device_map,
125 | attn_implementation="flash_attention_2",
126 | )
127 |
128 | tokenizer = CodeLlamaTokenizer.from_pretrained(base_model)
129 |
130 | tokenizer.pad_token_id = (
131 | 0 # unk.
132 | )
133 | tokenizer.padding_side = "left" # Allow batched inference
134 |
135 | def tokenize(prompt, add_eos_token=True):
136 | result = tokenizer(
137 | prompt,
138 | truncation=True,
139 | max_length=cutoff_len,
140 | padding=False,
141 | return_tensors=None,
142 | )
143 | if (
144 | result["input_ids"][-1] != tokenizer.eos_token_id
145 | and len(result["input_ids"]) < cutoff_len
146 | and add_eos_token
147 | ):
148 | result["input_ids"].append(tokenizer.eos_token_id)
149 | result["attention_mask"].append(1)
150 |
151 | result["labels"] = result["input_ids"].copy()
152 |
153 | return result
154 |
155 | def generate_and_tokenize_prompt(data_point):
156 | full_prompt = prompter.generate_prompt(
157 | data_point["src_code"],
158 | data_point["tgt_code"],
159 | speedup_desc=data_point["speedup_desc"] if with_speedup_desc else None,
160 | speedup_bin=data_point["speedup_bin"] if with_speedup_bin else None,
161 | pctile=data_point["target_reward_updated_pct_bin"] if "target_reward_updated_pct_bin" in data_point else None,
162 | )
163 | tokenized_full_prompt = tokenize(full_prompt)
164 | if not train_on_inputs:
165 | user_prompt = prompter.generate_prompt(
166 | data_point["src_code"],
167 | speedup_desc=data_point["speedup_desc"] if with_speedup_desc else None,
168 | speedup_bin=data_point["speedup_bin"] if with_speedup_bin else None,
169 | pctile=data_point["target_reward_updated_pct_bin"] if "target_reward_updated_pct_bin" in data_point else None,
170 | )
171 | tokenized_user_prompt = tokenize(
172 | user_prompt, add_eos_token=add_eos_token
173 | )
174 | user_prompt_len = len(tokenized_user_prompt["input_ids"])
175 |
176 | if add_eos_token:
177 | user_prompt_len -= 1
178 |
179 | tokenized_full_prompt["labels"] = [
180 | -100
181 | ] * user_prompt_len + tokenized_full_prompt["labels"][
182 | user_prompt_len:
183 | ] # could be sped up, probably
184 | return tokenized_full_prompt
185 |
186 | ## Loading data
187 |
188 | datafiles = {'train': f'{data_path}//{train_name}', 'test': f'{data_path}//{test_name}', 'validation': f'{data_path}//{val_name}'}
189 |
190 | data = load_dataset("json", data_files=datafiles)
191 |
192 | print(f"Is training on inputs: {train_on_inputs}")
193 |
194 | train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
195 | val_data = data["validation"].shuffle().map(generate_and_tokenize_prompt)
196 |
197 | # model.print_trainable_parameters() # Be more transparent about the % of trainable params.
198 |
199 | if not ddp and torch.cuda.device_count() > 1:
200 | # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
201 | model.is_parallelizable = True
202 | model.model_parallel = True
203 |
204 | trainer = transformers.Trainer(
205 | model=model,
206 | train_dataset=train_data,
207 | eval_dataset=val_data,
208 | args=transformers.TrainingArguments(
209 | per_device_train_batch_size=micro_batch_size,
210 | gradient_accumulation_steps=gradient_accumulation_steps,
211 | warmup_steps=100,
212 | num_train_epochs=num_epochs,
213 | learning_rate=learning_rate,
214 | bf16=True,
215 | logging_steps=1,
216 | optim="adamw_torch",
217 | evaluation_strategy="steps" if val_set_size > 0 else "no",
218 | save_strategy="steps",
219 | eval_steps=50 if val_set_size > 0 else None,
220 | save_steps=50,
221 | output_dir=output_dir,
222 | save_total_limit=10,
223 | load_best_model_at_end=True if val_set_size > 0 else False,
224 | ddp_find_unused_parameters=False if ddp else None,
225 | group_by_length=group_by_length,
226 | report_to="wandb" if use_wandb else None,
227 | run_name=wandb_run_name if use_wandb else "none",
228 | fsdp=["full_shard", "auto_wrap"],
229 | gradient_checkpointing=True,
230 | resume_from_checkpoint=f"{output_dir}" if resume_from_checkpoint else None,
231 | ),
232 | data_collator=transformers.DataCollatorForSeq2Seq(
233 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
234 | ),
235 | )
236 | model.config.use_cache = False
237 |
238 | if torch.__version__ >= "2" and sys.platform != "win32":
239 | model = torch.compile(model)
240 |
241 | trainer.train(resume_from_checkpoint=resume_from_checkpoint)
242 |
243 | model.save_pretrained(output_dir, max_shard_size="100GB")  # save in a single shard to work with the TGI docker image; multiple shards caused bugs
244 |
245 | print(
246 | "\n If there's a warning about missing keys above, please disregard :)"
247 | )
248 |
249 |
250 | if __name__ == "__main__":
251 | fire.Fire(train)
252 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Performance-Improving Code Edits
2 |
3 | Repository for *Learning Performance-Improving Code Edits* ([paper](https://openreview.net/forum?id=ix7rLVHXyY), [website](https://pie4perf.com/)).
4 |
5 | 🚨 Benchmarking programs is easy, but benchmarking programs in a reproducible and deterministic manner is very hard.
6 |
7 | 🚨 LLMs are not great at program optimization out of the box (at least for competitive programming problems).
8 |
9 |
10 | We perform extensive experiments to evaluate and improve Large Language Models (LLMs) for program optimization. We built a custom evaluation framework that benchmarks program execution time in a highly reliable manner, and we provide a dataset annotated with execution-time information from our environment.
11 |
12 | When measuring average program speedup, we obtained a fine-tuned version of CodeLlama 13B that outperforms GPT-4 and the best human programmer. Using self-play for program optimization, we also obtained a fine-tuned version of GPT-3.5 that is even stronger.
13 |
14 |
15 |
17 |
18 |
19 |
20 |
21 |
23 |
24 |
25 | ## Dataset
26 |
27 | - PIE is based on [IBM CodeNet](https://github.com/IBM/Project_CodeNet). Huge thanks to the authors of CodeNet for making their curated dataset available!
28 |
29 | Our Train/Val/Test splits are located [here](https://drive.google.com/drive/folders/1E_yFqM8khN1HAH03OKhjheSlNI4rYTT7?usp=sharing). There is also a `train_with_synthetic.jsonl` file which contains an additional ~1.4K pairs generated via self-play. We also have subsets `train_hq_only.jsonl` and `train_hq_and_synthetic.jsonl` which contain only the high-quality pairs and the high-quality pairs plus the synthetic pairs, respectively.
30 |
31 | Testcases:
32 |
33 | - [Merged test cases](https://drive.google.com/file/d/1evBDJapwRvCQK6VUCTV8ZE9WG2k3QJQr/view?usp=sharing) containing both public and generated test cases: these test cases were the ones used for experiments in the paper.
34 | - [Public test cases](https://drive.google.com/file/d/1RcUpZMOR8L2xYYWDZx7I0tHFzFgg7COO/view?usp=share_link). These test cases are sourced from IBM CodeNet.
35 | - [Generated test cases](https://drive.google.com/file/d/1migwX4wpED0gDDxn7gS6q55vWeXIDgId/view?usp=drive_link). These test cases are sourced from [alphacode](https://github.com/google-deepmind/code_contests).
36 |
37 | The column `tests` in the jsonl files will contain the indices which should be used for benchmarking models.
38 |
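As a minimal sketch of how a split can be inspected (assuming `pandas`; the file path here is illustrative, and the `src_code`/`tgt_code` column names follow the fine-tuning scripts in this repository):

```python
import pandas as pd

# Load one of the PIE splits (the path below is illustrative).
df = pd.read_json("pie/test.jsonl", lines=True, orient="records")

row = df.iloc[0]
slow_program = row["src_code"]   # slower source program (column name as used by the fine-tuning scripts)
fast_program = row["tgt_code"]   # faster target program
test_indices = row["tests"]      # indices of the test cases to use when benchmarking model outputs
print(len(df), len(test_indices))
```
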
39 | ## Program Benchmarking with gem5
40 |
41 | Benchmarking programs is easy, but benchmarking programs in a reproducible and deterministic manner is very hard.
42 | Reproducibility is important because we want to compare the performance of different models on the same set of programs irrespective of
43 | a researcher's server configuration. Moreover, you can even wind up in scenarios where you benchmark the exact same program twice and accidentally conclude that one run is much faster than the other.
44 |
45 |
46 |
47 | We built a custom evaluation framework that benchmarks program execution time in a highly-reliable manner.
48 | We built an execution sandbox based on the gem5 simulator. Provided a program terminates (i.e., does not time out), the benchmarking results are deterministic.
49 | For our experiments, we use a simulated model of an Intel Skylake CPU.
50 | We provide an easy-to-use docker image and API that can be used to reproduce our results and for other researchers to continue to use for program optimization research.
51 |
52 | Building the environment is similar to the [gym](https://github.com/Farama-Foundation/Gymnasium) API for reinforcement learning. After importing the module and calling `make`, the docker image should automatically be pulled on the first call and a container created. The returned environment object then provides a convenient abstraction for interacting with the simulator. More information is located at [gem5](./gem5/README.md).
53 |
54 | It is possible that on a different host architecture, the gem5 simulator runs slower or faster than when we ran it, so results could be influenced by more- or less-frequent timeouts. Generally this should affect programs on the threshold of timing out,
55 | and it should affect more-aggressive optimizations (often "better" models) less than less-aggressive optimizations.
56 |
57 | ```python
58 | import simulator
59 |
60 | # pulls the image from docker hub if it doesn't exist and sets up a connection with a running container
61 | env = simulator.make(arch='X86-skylake', optimization_flag='-O3')
62 | # example sending a program to benchmark within the environment
63 | gem5_benchmarking_results = env.submit_single_submission(...)
64 | ```
65 | ## Performance-Conditioning
66 |
67 | Programs can typically be written in many ways with different performance profiles. When training a model to predict performance-improving edits with
68 | a large dataset, it may be trained on a mix of large and small improvements, without any information on which improvements are more desirable than others. We introduce performance tags during training by associating each “fast” program with a tag indicating how close it comes to the best achievable performance across all solutions in the dataset.
69 |
70 |
71 |
72 | Specifically, the tag indicates how close that program is to peak performance on a binned scale
73 | {1, 2, ..., 10}. Then at test time, we prompt the model with a test input and the maximal score tag “10/10”, directing it to generate a highly-optimized solution.
74 |
75 | The performance tags are available for the [training dataset](#dataset) and can be used to train models with performance-conditioning. We also provide our fine-tuning code which adds the prompts during training and inference.
76 |
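The released tags ship with the dataset itself; as a rough illustration of the idea (not the exact procedure used to produce those tags), a tag can be derived by comparing a program's measured runtime with the best known runtime for the same problem:

```python
def performance_tag(runtime: float, best_runtime: float, n_bins: int = 10) -> str:
    """Illustrative only: map a runtime to a {1, ..., 10} tag, where 10/10
    means the program matches the fastest known solution for its problem."""
    ratio = best_runtime / runtime               # 1.0 when the program is already optimal
    bin_index = max(1, min(n_bins, round(ratio * n_bins)))
    return f"{bin_index}/{n_bins}"

performance_tag(runtime=2.0, best_runtime=1.0)   # "5/10": twice as slow as the best solution
performance_tag(runtime=1.0, best_runtime=1.0)   # "10/10": already optimal
```
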
77 | ## Self-Play
78 |
79 | In an attempt to boost the performance of our models, we also investigate the use of self-play for program optimization as a data augmentation technique. Because there is a limited set of programs in our dataset, we use an off-the-shelf language model to generate new programs and a high-quality fine-tuned model to generate new optimizations. After taking some rigorous steps to ensure the generated programs are semantically novel and the optimizations are non-trivial, we use the generated programs and optimizations to create new program optimization pairs.
80 |
81 |
82 | The self-play notion comes from the fact that one model is used to generate the programs to solve and another model is used to propose the optimizations.
83 |
84 |
85 |
86 |
87 |
88 | Our best model without self-play was GPT3.5 Turbo fine-tuned on 4,085 high-quality pairs. We were able to sample 3,314 novel programs and obtain 1,485 high-quality optimizations.
89 |
90 | Using these additional 1,485 optimizations helped improve the performance of our fine-tuned model. We also performed an ablation by adding 1,485 next-best programs from the PIE dataset for fine-tuning GPT3.5 turbo, but these pairs led to performance degradation.
91 |
92 |
93 | We provide our scripts for [sampling programs and detecting semantic duplicates](./data_augmentation/data_augmentation_driver_final.sh) and the [self-play data itself](#dataset).
94 |
95 |
96 | # Running Experiments
97 |
98 | ## Finetuning Open Source Models
99 |
100 | We provide a docker image at ```yimengzeng/pie:torch201``` which contains all of the dependencies for finetuning the model; you can also refer to ```docker/Dockerfile``` for the specific packages required to replicate the environment.
101 |
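For example, the image can be pulled ahead of time (the exact `docker run` flags, e.g. for GPUs and volume mounts, depend on your setup and are not shown here):

```bash
docker pull yimengzeng/pie:torch201
```
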
102 | To finetune codellama with the entire PIE dataset and the non-performance-conditioned prompt, run
103 | ```bash
104 | bash finetuning/train.sh
105 | ```
106 | To finetune codellama with the performance-conditioned prompt, change the ```--prompt_template_name``` flag to ```"code_opt_w_speedup_pctile"```, as sketched below. More details are located in the ```finetuning``` directory.
107 |
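Schematically, the change inside ```finetuning/train.sh``` looks as follows (all other flags stay as they are; only the template name is swapped):

```bash
# inside finetuning/train.sh (schematic; only the template flag changes)
    --prompt_template_name "code_opt_w_speedup_pctile" \
```
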
108 | ## Finetuning OpenAI Models
109 |
110 | The script `openai_finetuning/finetune_openai.py` was used to finetune GPT3.5 Turbo. Its usage is as follows:
111 |
112 | ```bash
113 | python finetune_openai.py PATH_TO_CONFIG.yaml
114 | ```
115 |
116 | More details and an example config file are located in the `openai_finetuning` directory.
117 |
118 | ## Dynamic Retrieval
119 |
120 | A notebook that can be used to prepare the retrieval dataset is `retrieval/retrieval.ipynb`. Given a training dataset and the test set examples to optimize, it will retrieve the K most similar training pairs for the given test set examples. The retrieved pairs are then used to prompt the model for optimized outputs.
121 |
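The notebook implements the full pipeline; as a rough sketch of the retrieval step only (the notebook's actual code representation and similarity measure may differ), nearest neighbours can be computed over a simple lexical encoding of the slow programs:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_top_k(train_src_programs, test_src_program, k=4):
    """Return indices of the k training pairs whose slow program is most
    similar to the test program (TF-IDF over whitespace tokens; illustrative only)."""
    vectorizer = TfidfVectorizer(analyzer="word", token_pattern=r"\S+")
    train_matrix = vectorizer.fit_transform(train_src_programs)
    test_vector = vectorizer.transform([test_src_program])
    scores = cosine_similarity(test_vector, train_matrix).ravel()
    return np.argsort(-scores)[:k]
```
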
122 | ## Sampling from Models
123 |
124 | To generate prompts for the models, please follow the details in the paper. Additional utilities for constructing prompts are located in `finetuning/templates` and the `finetuning/utils/prompter.py` module.
125 |
126 | Samples from our fine-tuned models are located [here](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing).
127 |
128 | #### Sampling from Open Source Models
129 | To sample optimized programs using the finetuned model with the ```text-generation-inference``` tool, first set the ```PATH_TO_MODEL``` field in ```server.sh``` to the actual path of the finetuned model, and then serve the model by running
130 | ```bash
131 | bash finetuning/server.sh
132 | ```
133 |
134 | To sample from the model just served with default parameters as in the paper, run
135 | ```bash
136 | bash finetuning/sample.sh
137 | ```
138 |
139 | More details are located in the ```finetuning``` directory.
140 |
141 | #### Sampling from OpenAI
142 |
143 | We used [prompt-lib](https://github.com/reasoning-machines/prompt-lib/tree/main) to sample from OpenAI's endpoints.
144 |
145 | ## Self-Play Experiments
146 |
147 | The directory `data_augmentation` contains the scripts used to sample and filter out novel competitive programming problems for PIE.
148 |
149 | The script ``data_augmentation/data_augmentation_driver_final.sh`` contains the final parameters we used to sample the problems. More details are located in the `data_augmentation` directory.
150 |
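Assuming the dependencies described in that directory are installed, the driver can be invoked directly from the project root:

```bash
bash data_augmentation/data_augmentation_driver_final.sh
```
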
151 | ## Evaluation
152 |
153 | The evaluation driver is located in `gem5/gem5_eval.py`. This script requires a yaml configuration file to be passed in as an argument to `--config_path`. Example usage from the project directory would be:
154 |
155 | ```bash
156 | export PYTHONPATH=$PYTHONPATH:$(pwd)
157 | python gem5/gem5_eval.py --config_path PATH_TO_EXPERIMENT_CONFIG.yaml
158 | ```
159 |
160 | Results from our experiments can be located in [this google drive folder](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing).
161 |
162 | More details are located in the `gem5` directory.
163 |
164 | ----
165 |
166 | ## Citation
167 |
168 | ```
169 | @inproceedings{pie_iclr_2024_spotlight,
170 | title={\href{https://openreview.net/pdf?id=ix7rLVHXyY}{Learning Performance-Improving Code Edits}},
171 | author={Shypula, Alexander and Madaan, Aman and Zeng, Yimeng and Alon, Uri and Gardner, Jacob and Hashemi, Milad and Neubig, Graham and Ranganathan, Parthasarathy and Bastani, Osbert and Yazdanbakhsh, Amir},
172 | booktitle={The Twelfth International Conference on Learning Representations (ICLR)},
173 | year={2024}
174 | }
175 | ```
176 |
177 |
178 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/finetuning/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2024 Alexander G Shypula, Aman Madaan, Yimeng Zeng, Uri Alon, Jacob R. Gardner, Yiming Yang, Milad Hashemi, Graham Neubig, Parthasarathy Ranganathan, Osbert Bastani, Amir Yazdanbakhsh
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/gem5/api_pytest.py:
--------------------------------------------------------------------------------
1 | import benchmarking
2 | import tempfile
3 | import subprocess
4 | import os
5 | import glob
6 | import numpy as np
7 | from tqdm import tqdm
8 | from collections import defaultdict
9 |
10 | count_to_10_cpp = """
11 | #include <iostream>
12 | using namespace std;
13 |
14 | int main() {
15 | for (int i = 0; i < 10; i++) {
16 | cout << i << endl;
17 | }
18 | return 0;
19 | }
20 | """
21 |
22 | mult_in_by_2_cpp = """
23 | #include <iostream>
24 | using namespace std;
25 |
26 | int main() {
27 | int x;
28 | cin >> x;
29 | cout << x * 2 << endl;
30 | return 0;
31 | }
32 | """
33 |
34 | example_1_code = """
35 | #include <bits/stdc++.h>
36 | #define REP(i, n) for (int i = 0; i < (n); i++)
37 | using namespace std;
38 | const int MOD = 998244353;
39 |
40 | int main() {
41 | cin.tie(0)->sync_with_stdio(false);
42 |
43 | int n, k; cin >> n >> k;
44 | vector l(k), r(k);
45 | REP(i, k) cin >> l[i] >> r[i];
46 | REP(i, k) r[i]++;
47 |
48 | vector dp(n + 1, 0);
49 | dp[0] = 1;
50 | dp[1] = -1;
51 | REP(i, n) {
52 | if (i > 0)
53 | dp[i] = (dp[i] + dp[i - 1]) % MOD;
54 | REP(j, k) {
55 | if (i + l[j] < n)
56 | dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
57 | if (i + r[j] < n)
58 | dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
59 | }
60 | }
61 | cout << dp[n - 1] << endl;
62 | return 0;
63 | }
64 | """
65 | example_1_problem_id = "p02549"
66 |
67 | example_hello_world_code = """
68 | #include <iostream>
69 |
70 | int main() {
71 | std::cout << "Hello, World!" << std::endl;
72 | return 0;
73 | }
74 | """
75 |
76 | # def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout):
77 | # logging.info(f'executing {bin_path}, with input {in_path}')
78 | # with open(in_path, 'r') as fh:
79 | # p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
80 | # if p.returncode != 0:
81 | # raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr.decode('utf-8')}")
82 | # else:
83 | # return get_accuracy(p.stdout, ground_truth_output)
84 |
85 |
86 | class TestBenchmarking:
87 | def test_compile(self):
88 | with tempfile.TemporaryDirectory() as tmpdir:
89 | code_path = os.path.join(tmpdir, "basic.cpp")
90 | with open(code_path, "w") as f:
91 | f.write(count_to_10_cpp)
92 | output_path = benchmarking.compile_cpp_code(code_path)
93 | p = subprocess.run([output_path], capture_output=True, text=True)
94 | assert p.returncode == 0
95 | assert p.stdout.strip() == "\n".join([str(i) for i in range(10)])
96 | assert os.path.exists(output_path)
97 | assert os.path.getsize(output_path) > 0
98 |
99 | def test_exec_bin(self):
100 | with tempfile.TemporaryDirectory() as tmpdir:
101 | code_path = os.path.join(tmpdir, "basic.cpp")
102 | with open(code_path, "w") as f:
103 | f.write(count_to_10_cpp)
104 | output_path = benchmarking.compile_cpp_code(code_path)
105 | rc, stdout, stderr = benchmarking.exec_bin(output_path, None, None)
106 | assert rc == 0
107 | assert stdout.strip() == "\n".join([str(i) for i in range(10)])
108 | assert stderr == ""
109 |
110 | def test_exec_bin_input(self):
111 | with tempfile.TemporaryDirectory() as tmpdir:
112 | code_path = os.path.join(tmpdir, "basic.cpp")
113 | input_path = os.path.join(tmpdir, "input.txt")
114 | with open(code_path, "w") as f:
115 | f.write(mult_in_by_2_cpp)
116 | with open(input_path, "w") as f:
117 | f.write("2")
118 | output_path = benchmarking.compile_cpp_code(code_path)
119 | rc, stdout, stderr = benchmarking.exec_bin(output_path, input_path, None)
120 | assert rc == 0
121 | assert stdout.strip() == "4"
122 | assert stderr == ""
123 |
124 | def test_exec_bin_for_acc(self):
125 | with tempfile.TemporaryDirectory() as tmpdir:
126 | code_path = os.path.join(tmpdir, "basic.cpp")
127 | input_path = os.path.join(tmpdir, "input.txt")
128 | with open(code_path, "w") as f:
129 | f.write(mult_in_by_2_cpp)
130 | with open(input_path, "w") as f:
131 | f.write("2")
132 | output_path = benchmarking.compile_cpp_code(code_path)
133 | acc_correct = benchmarking.exec_bin_for_acc(output_path, input_path, "4", None)
134 | acc_incorrect = benchmarking.exec_bin_for_acc(output_path, input_path, "5", None)
135 | assert acc_correct == 1
136 | assert acc_incorrect == 0
137 |
138 | def test_compile_and_check_outputs(self):
139 | with tempfile.TemporaryDirectory() as tempdir:
140 | code_path = os.path.join(tempdir, "basic.cpp")
141 | with open(code_path, "w") as fh:
142 | fh.write(example_1_code)
143 | bin_path, accs = benchmarking.compile_and_check_outputs(
144 | code_path=code_path,
145 | problem_id=example_1_problem_id,
146 | testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/"
147 | )
148 | print(f"bin_path: {bin_path}")
149 | assert os.path.exists(bin_path)
150 | assert os.path.getsize(bin_path) > 0
151 | assert np.mean(list(accs.values())) == 1.0
152 | assert np.std(list(accs.values())) == 0.0
153 | n_testcases = len(glob.glob(os.path.join("/home/pie-perf/data/codenet/merged_test_cases/", example_1_problem_id, "input.*.txt")))
154 | assert len(accs) == n_testcases
155 |
156 | def test_exec_gem5(self):
157 | sim_seconds = []
158 | sim_seconds_precise = []
159 | for _ in tqdm(range(5)):
160 | with tempfile.TemporaryDirectory() as tmpdir:
161 | code_path = os.path.join(tmpdir, "basic.cpp")
162 | with open(code_path, "w") as f:
163 | f.write(example_hello_world_code)
164 | output_path = benchmarking.compile_cpp_code(code_path, cflags="--std=c++17 -O3")
165 | rc, stdout, stderr = benchmarking.exec_gem5(
166 | gem5_dir="/home/gem5/build/X86/",
167 | gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py",
168 | cpu_type="Verbatim",
169 | bin_path=output_path,
170 | in_path=None,
171 | stats_out_path=os.path.join(tmpdir, "stats.txt"),
172 | timeout=60,
173 | cpu_number=0)
174 |
175 | assert rc == 0
176 | stats = benchmarking.parse_stats_txt(os.path.join(tmpdir, "stats.txt"))
177 | sim_seconds.append(stats["sim_seconds"])
178 | sim_seconds_precise.append(stats["sim_seconds_precise"])
179 | print(f"sim_seconds: {sim_seconds}")
180 | print(f"sim_seconds_precise: {sim_seconds_precise}")
181 | assert np.isclose(np.mean(sim_seconds), 0.001004, atol=1e-5)
182 | assert np.isclose(np.mean(sim_seconds_precise), 0.001004, atol=1e-5)
183 | assert all(sim_seconds_precise[i] == 0.001004121118 for i in range(len(sim_seconds_precise)))
184 |
185 | def test_run_gem5(self):
186 | sim_seconds_0 = []
187 | sim_seconds_1 = []
188 | for _ in tqdm(range(2)):
189 | with tempfile.TemporaryDirectory() as tmpdir:
190 | code_path = os.path.join(tmpdir, "code.cpp")
191 | with open(code_path, "w") as f:
192 | f.write(example_1_code)
193 | bin_path = benchmarking.compile_cpp_code(code_path)
194 | tc_2_results = benchmarking.run_gem5(
195 | gem5_dir="/home/gem5/build/X86/",
196 | gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py",
197 | cpu_type="Verbatim",
198 | bin_path=bin_path,
199 | problem_id=example_1_problem_id,
200 | testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/",
201 | testcases=[0,1],
202 | timeout=30,
203 | cpu_number=0
204 | )
205 | assert tc_2_results[0]["success"] == True
206 | assert tc_2_results[1]["success"] == True
207 | assert len(tc_2_results) == 2
208 | sim_seconds_0.append(tc_2_results[0]["stats"]["sim_seconds_precise"])
209 | sim_seconds_1.append(tc_2_results[1]["stats"]["sim_seconds_precise"])
210 | print(f"sim_seconds for tc 0 {sim_seconds_0}")
211 | print(f"sim_seconds for tc 1 {sim_seconds_1}")
212 | assert sim_seconds_0[0] == sim_seconds_0[1] == 0.001035073468
213 | assert sim_seconds_1[0] == sim_seconds_1[1] == 0.001039205596
214 |
215 |
216 | def test_run_hyperfine(self):
217 | tc2times = defaultdict(list)
218 | for _ in range(2):
219 | with tempfile.TemporaryDirectory() as tmpdir:
220 | code_path = os.path.join(tmpdir, "code.cpp")
221 | with open(code_path, "w") as f:
222 | f.write(example_1_code)
223 | code2results, output = benchmarking.run_hyperfine(
224 | code_paths=[code_path],
225 | problem_ids=[example_1_problem_id],
226 | path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
227 | json_out_path=os.path.join(tmpdir, "results.json"),
228 | test_cases_list=[[i for i in range(10)]],
229 | min_runs_per_test_case=10,
230 | max_runs_per_test_case=500,
231 | strict_runs_per_test_case=False,
232 | warmup_runs_per_test_case=5,
233 | cpu_number=0,
234 | do_sanity_check=True)
235 | for tc, results in code2results[code_path].items():
236 | tc2times[tc].append(np.array(results["times"]))
237 | for tc, times in tc2times.items():
238 | mean_times = []
239 | for time_list in times:
240 | mean_times.append(np.mean(time_list))
241 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
242 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
243 | assert len(tc2times) == 10
244 |
245 | def test_run_hyperfine_strict(self):
246 | tc2times = defaultdict(list)
247 | for _ in range(2):
248 | with tempfile.TemporaryDirectory() as tmpdir:
249 | code_path = os.path.join(tmpdir, "code.cpp")
250 | with open(code_path, "w") as f:
251 | f.write(example_1_code)
252 | code2results, output = benchmarking.run_hyperfine(
253 | code_paths=[code_path],
254 | problem_ids=[example_1_problem_id],
255 | path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
256 | json_out_path=os.path.join(tmpdir, "results.json"),
257 | test_cases_list=None,
258 | min_runs_per_test_case=100,
259 | max_runs_per_test_case=None,
260 | strict_runs_per_test_case=True,
261 | warmup_runs_per_test_case=5,
262 | cpu_number=0,
263 | do_sanity_check=True)
264 | for tc, results in code2results[code_path].items():
265 | tc2times[tc].append(np.array(results["times"]))
266 | for tc, times in tc2times.items():
267 | assert len(times) == 2
268 | mean_times = []
269 | for time_list in times:
270 | assert len(time_list) == 100
271 | mean_times.append(np.mean(time_list))
272 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
273 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
274 | assert len(tc2times) == len(glob.glob(f"/home/pie-perf/data/codenet/merged_test_cases/{example_1_problem_id}/input*"))
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
--------------------------------------------------------------------------------
/openai_finetuning/finetune_openai.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | import sys
4 | sys.path.append(os.path.dirname(os.path.realpath(__file__)))
5 | import pie_chatgpt
6 | import re
7 | import json
8 | from typing import List, Dict
9 | import yaml
10 | import logging
11 | import shutil
12 | import uuid
13 | import time
14 | import json
15 | import os
16 | from time import sleep
17 | from io import StringIO
18 | import openai
19 |
20 |
21 |
22 | def load_data(train_path, test_path, max_train, max_val):
23 | df_train = pd.read_json(train_path, lines=True, orient='records')
24 | df_train = df_train.sample(frac=1).reset_index(drop=True)
25 | df_train = df_train[:max_train]
26 | df_test = pd.read_json(test_path, lines=True, orient='records')
27 | df_test = df_test.sample(frac=1).reset_index(drop=True)
28 | df_test = df_test[:max_val]
29 | return df_train, df_test
30 |
31 |
32 |
33 | def prepare_output(code_str, max_len=-1, tokenizer=None):
34 | # optionally truncate the target code to max_len characters (a tokenizer must be provided when max_len > 0)
35 | if max_len > 0 and tokenizer:
36 | code_str = code_str[:max_len]
37 | elif max_len > 0 and not tokenizer:
38 | raise ValueError("max_len > 0 but no tokenizer provided")
39 | return code_str
40 |
41 |
42 | def prepare_dataset(df, src_code_col, tgt_code_col, max_len=-1, tokenizer=None, max_examples=-1):
43 | df = df.copy()
44 | if max_examples > 0:
45 | df = df.sample(frac=1).reset_index(drop=True)
46 | df = df[:max_examples]
47 | training_examples = []
48 | for i, row in df.iterrows():
49 | src_code = row[src_code_col]
50 | src_code_formatted = pie_chatgpt.ChatGPTWrapper.prepare_input(src_code)
51 | tgt_code = row[tgt_code_col]
52 | tgt_code_formatted = prepare_output(tgt_code, max_len=max_len, tokenizer=tokenizer)
53 |
54 | d = [
55 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
56 | {"role": "user", "content": src_code_formatted},
57 | {"role": "assistant", "content": tgt_code_formatted},
58 | ]
59 | training_examples.append({"messages": d})
60 | return training_examples
61 |
62 |
63 |
64 | def save_dataset(training_examples: List[Dict], file_name: str):
65 | with open(file_name, 'w') as jsonl_file:
66 | for example in training_examples:
67 | jsonl_file.write(json.dumps(example) + '\n')
68 |
69 |
70 | def register_file_openai(file_path, outpath, sleep_interval=30):
71 | logger.info(f"Registering file {file_path} to OpenAI")
72 | file_dict = openai.File.create(
73 | file=open(file_path, "rb"),
74 | purpose='fine-tune',
75 | )
76 | logger.info(f"File registered with id {file_dict['id']}")
77 | while file_dict['status'] != 'processed':
78 | file_dict = openai.File.retrieve(file_dict['id'])
79 | logger.info(f"File status: {file_dict['status']}")
80 | with open(outpath, 'w') as json_file:
81 | json.dump(file_dict, json_file)
82 | if file_dict['status'] != 'processed':
83 | logger.info(f"Sleeping for {sleep_interval} seconds")
84 | sleep(sleep_interval)
85 | return file_dict
86 |
87 |
88 | def main(input_train_path, input_test_path, max_train, max_val, max_len, tokenizer, output_dir, model_suffix="pie_opt", epochs=1):
89 | logging.info(f"Input train path: {input_train_path}; epochs: {epochs}")
90 | if not os.path.exists(output_dir):
91 | os.makedirs(output_dir)
92 | df_train, df_test = load_data(input_train_path, input_test_path, max_train, max_val)
93 | logger.info(f"Loaded {len(df_train)} training examples and {len(df_test)} test examples")
94 | training_examples = prepare_dataset(df_train, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
95 | if os.path.exists(os.path.join(output_dir, "train.jsonl")):
96 | unique_id = uuid.uuid4()
97 | logger.warning(f"File {os.path.join(output_dir, 'train.jsonl')} already exists, copying to {os.path.join(output_dir, f'train_{unique_id}.jsonl')}")
98 | shutil.copy(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, f"train_{unique_id}.jsonl"))
99 | save_dataset(training_examples, os.path.join(output_dir, "train.jsonl"))
100 | training_examples = prepare_dataset(df_test, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
101 | if os.path.exists(os.path.join(output_dir, "test.jsonl")):
102 | unique_id = uuid.uuid4()
103 | logger.warning(f"File {os.path.join(output_dir, 'test.jsonl')} already exists, copying to {os.path.join(output_dir, f'test_{unique_id}.jsonl')}")
104 | shutil.copy(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, f"test_{unique_id}.jsonl"))
105 | save_dataset(training_examples, os.path.join(output_dir, "test.jsonl"))
106 | train_data = register_file_openai(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, "openai_train_file.json"))
107 | val_data = register_file_openai(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, "openai_val_file.json"))
108 | train_data, val_data = wait_on_data(train_data, val_data)
109 | assert train_data['status'] == 'processed'
110 | assert val_data['status'] == 'processed'
111 | with open(os.path.join(output_dir, "openai_train_file.json"), 'w') as train_json_file, open(os.path.join(output_dir, "openai_val_file.json"), 'w') as val_json_file:
112 | json.dump(train_data, train_json_file)
113 | json.dump(val_data, val_json_file)
114 |
115 | model = openai.FineTuningJob.create(
116 | model = "gpt-3.5-turbo",
117 | training_file = train_data['id'],
118 | validation_file = val_data['id'],
119 | suffix = model_suffix,
120 | hyperparameters = {"n_epochs": epochs}
121 | )
122 | logging.info(f"Model {model['id']} created")
123 | logging.info(f"Model dict: {model}")
124 | monitor_model(model, output_dir)
125 | return model
126 |
127 | def wait_on_data(train_data, val_data, max_timeout = 600, sleep_interval=10):
128 | start = time.time()
129 | while train_data['status'] != 'processed' or val_data['status'] != 'processed':
130 | train_data = openai.File.retrieve(train_data['id'])
131 | val_data = openai.File.retrieve(val_data['id'])
132 | logger.info(f"Train data status: {train_data['status']} status_details: {train_data['status_details']}")
133 | logger.info(f"Val data status: {val_data['status']}, status_details: {val_data['status_details']}")
134 | if time.time() - start > max_timeout:
135 | raise TimeoutError("Timeout waiting for data")
136 | logger.info(f"Sleeping for {sleep_interval} seconds")
137 | sleep(sleep_interval)
138 | return train_data, val_data
139 |
140 |
141 | def get_step_metrics(file_id):
142 | content = openai.File.download(file_id)
143 | eval_result = StringIO(content.decode())
144 | df = pd.read_csv(eval_result, sep=",")
145 | return df
146 |
147 |
148 | def handle_get_step_metrics(file_id, output_dir):
149 | content = openai.File.download(file_id)
150 | eval_result = StringIO(content.decode())
151 | try:
152 | df = pd.read_csv(eval_result, sep=",")
153 | df.to_csv(os.path.join(output_dir, f"success_{file_id}.csv"), index=False)
154 | return df
155 | except Exception as e:
156 | error_message = f"Error reading file {file_id}: {e}\n"
157 | file_content_message = f"File content: {content}\n"
158 | file_content_decoded_message = f"File content decoded: {content.decode()}\n"
159 | eval_result_content_message = f"Eval result content: {eval_result.getvalue()}\n"
160 |
161 | with open(os.path.join(output_dir, f"error_{file_id}.txt"), 'w') as error_file:
162 | error_file.write(error_message)
163 | error_file.write(file_content_message)
164 | error_file.write(file_content_decoded_message)
165 | error_file.write(eval_result_content_message)
166 |
167 | logger.error(error_message)
168 | logger.error(file_content_message)
169 | logger.error(file_content_decoded_message)
170 | logger.error(eval_result_content_message)
171 |
172 | return None
173 |
174 | SAMPLE_CPP_PROGRAM_TO_OPTIMIZE = """
175 | #include <stdio.h>
176 | #include <stdlib.h>
177 | #include <string.h>
178 | #include <time.h>
179 |
180 | int main(int argc, char** argv) {
181 | int n = 1000000;
182 | int* a = (int*) malloc(n * sizeof(int));
183 | int* b = (int*) malloc(n * sizeof(int));
184 | int* c = (int*) malloc(n * sizeof(int));
185 | for (int i = 0; i < n; i++) {
186 | a[i] = i;
187 | b[i] = i;
188 | }
189 | for (int i = 0; i < n; i++) {
190 | c[i] = a[i] + b[i];
191 | }
192 | printf("%d", c[0]);
193 | free(a);
194 | free(b);
195 | free(c);
196 | return 0;
197 | }
198 | """
199 |
200 |
201 |
202 |
203 | def monitor_model(model_dict, output_dir, sleep_interval=30):
204 | model = openai.FineTuningJob.retrieve(model_dict['id'])
205 | logger.info(f"Model status: {model['status']}")
206 | while model['status'] != 'succeeded':
207 | model = openai.FineTuningJob.retrieve(model_dict['id'])
208 | logger.info(f"Model status: {model['status']}")
209 | if model['status'] != 'succeeded':
210 | logger.info(f"Sleeping for {sleep_interval} seconds")
211 | if "result_files" in model:
212 | for file_id in model['result_files']:
213 | if file_id is not None:
214 | result = openai.File.download(file_id)
215 | with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file:
216 | result_file.write(result)
217 | logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.json')}")
218 | try:
219 | df = pd.read_csv(os.path.join(output_dir, f"result_{file_id}.csv"))
220 | last_row = df.iloc[-1]
221 | logger.info(f"Last row: {last_row}")
222 | except Exception as e:
223 | logger.error(f"Error reading file {file_id}: {e}")
224 | logger.error(f"File content: {result}")
225 | logger.error(f"File content decoded: {result.decode()}")
226 |
227 | with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
228 | json.dump(model, json_file)
229 | sleep(sleep_interval)
230 |
231 | if "result_files" in model:
232 | for file_id in model['result_files']:
233 | if file_id is not None:
234 | result = openai.File.download(file_id)
235 | with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file: # 'wb'
236 | result_file.write(result)
237 | logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.json')}")
238 |
239 | with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
240 | json.dump(model, json_file)
241 |
242 | # parse the clock time
243 | # finished_at = model['finished_at']
244 | # started_at = model['started_at']
245 | # total_time = finished_at - started_at
246 | finished_at = model.get('finished_at', None)
247 | started_at = model.get('started_at', None)
248 | if finished_at is not None and started_at is not None:
249 | total_time = finished_at - started_at
250 | logging.info(f"Model {model['id']} finished in {total_time / 60} minutes")
251 | if "trained_tokens" in model:
252 | logging.info(f"Model {model['id']} trained tokens: {model['trained_tokens']}")
253 |
254 | logging.info(f"Model {model['id']} fine-tuned model: {model['fine_tuned_model']}")
255 |
256 |
257 | chat_log = [
258 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
259 | {"role": "user", "content": pie_chatgpt.ChatGPTWrapper.prepare_input(SAMPLE_CPP_PROGRAM_TO_OPTIMIZE)},
260 | ]
261 |
262 | try:
263 | response = openai.ChatCompletion.create(
264 | model=model['fine_tuned_model'],
265 | messages=chat_log,
266 | max_tokens=1000,
267 | temperature=0.0,
268 | )
269 | logging.info(f"************************")
270 | logging.info(f"Input program: {SAMPLE_CPP_PROGRAM_TO_OPTIMIZE}")
271 | logging.info("************************")
272 | logging.info(f"Output program: {response['choices'][0]['message']['content']}")
273 | except Exception as e:
274 | logging.error(f"Error calling OpenAI API: {e}")
275 | logging.error(f"Chat log: {chat_log}")
276 |
277 | return model
278 |
279 |
280 | def load_config(yaml_path):
281 | with open(yaml_path, 'r') as file:
282 | config = yaml.safe_load(file)
283 | return config
284 |
285 |
286 |
287 | if __name__ == "__main__":
288 | import transformers
289 | tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
290 |
291 | if len(sys.argv) > 1:
292 | config_path = sys.argv[1]
293 | else:
294 | raise ValueError("No config path provided")
295 | config = load_config(config_path)
296 |
297 | openai.api_key = config['api_key']
298 | if 'organization' in config and config['organization']:
299 | openai.organization = config['organization']
300 |
301 | assert len(config['model_suffix']) > 0 and len(config['model_suffix']) < 19, "model_suffix must be between 1 and 18 characters"
302 |
303 | logger = logging.getLogger(__name__)
304 | ## log date and time
305 | if not os.path.exists(config['output_dir']):
306 | os.makedirs(config['output_dir'])
307 | logging.basicConfig(
308 | level=logging.INFO,
309 | format='%(asctime)s %(message)s',
310 | handlers=[
311 | logging.FileHandler(os.path.join(config['output_dir'], 'chatgpt_fine_tuning.log')),
312 | logging.StreamHandler()
313 | ]
314 | )
315 |
316 | logging.info(f"Config: {config}")
317 |
318 | main(
319 | input_train_path=config['input_train_path'],
320 | input_test_path=config['input_test_path'],
321 | max_train=config['max_train'],
322 | max_val=config['max_val'],
323 | max_len=config['max_len'],
324 | tokenizer=tokenizer,
325 | output_dir=config['output_dir'],
326 | model_suffix=config['model_suffix'],
327 | epochs=config['epochs']
328 | )
329 |
330 |
--------------------------------------------------------------------------------
/finetuning/utils/convert_to_safetensors.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import shutil
5 | from collections import defaultdict
6 | from inspect import signature
7 | from tempfile import TemporaryDirectory
8 | from typing import Dict, List, Optional, Set, Tuple
9 |
10 | import torch
11 |
12 | from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, hf_hub_download
13 | from huggingface_hub.file_download import repo_folder_name
14 | from safetensors.torch import load_file, save_file
15 | from transformers import AutoConfig
16 |
17 |
18 | COMMIT_DESCRIPTION = """
19 | This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
20 |
21 | This new file is equivalent to `pytorch_model.bin` but safe in the sense that
22 | no arbitrary code can be put into it.
23 |
24 | These files also happen to load much faster than their pytorch counterpart:
25 | https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
26 |
27 | The widgets on your model page will run using this model even if this is not merged
28 | making sure the file actually works.
29 |
30 | If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
31 |
32 | Feel free to ignore this PR.
33 | """
34 |
35 | ConversionResult = Tuple[List["CommitOperationAdd"], List[Tuple[str, "Exception"]]]
36 |
37 |
38 | class AlreadyExists(Exception):
39 | pass
40 |
41 |
42 | def shared_pointers(tensors):
43 | ptrs = defaultdict(list)
44 | for k, v in tensors.items():
45 | ptrs[v.data_ptr()].append(k)
46 | failing = []
47 | for ptr, names in ptrs.items():
48 | if len(names) > 1:
49 | failing.append(names)
50 | return failing
51 |
52 |
53 | def check_file_size(sf_filename: str, pt_filename: str):
54 | sf_size = os.stat(sf_filename).st_size
55 | pt_size = os.stat(pt_filename).st_size
56 |
57 | if (sf_size - pt_size) / pt_size > 0.01:
58 | raise RuntimeError(
59 |             f"""The file size difference is more than 1%:
60 | - {sf_filename}: {sf_size}
61 | - {pt_filename}: {pt_size}
62 | """
63 | )
64 |
65 |
66 | def rename(pt_filename: str) -> str:
67 | filename, ext = os.path.splitext(pt_filename)
68 | local = f"{filename}.safetensors"
69 | local = local.replace("pytorch_model", "model")
70 | return local
71 |
72 |
73 | def convert_multi(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
74 | filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json", token=token, cache_dir=folder)
75 | with open(filename, "r") as f:
76 | data = json.load(f)
77 |
78 | filenames = set(data["weight_map"].values())
79 | local_filenames = []
80 | for filename in filenames:
81 | pt_filename = hf_hub_download(repo_id=model_id, filename=filename, token=token, cache_dir=folder)
82 |
83 | sf_filename = rename(pt_filename)
84 | sf_filename = os.path.join(folder, sf_filename)
85 | convert_file(pt_filename, sf_filename)
86 | local_filenames.append(sf_filename)
87 |
88 | index = os.path.join(folder, "model.safetensors.index.json")
89 | with open(index, "w") as f:
90 | newdata = {k: v for k, v in data.items()}
91 | newmap = {k: rename(v) for k, v in data["weight_map"].items()}
92 | newdata["weight_map"] = newmap
93 | json.dump(newdata, f, indent=4)
94 | local_filenames.append(index)
95 |
96 | operations = [
97 | CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
98 | ]
99 | errors: List[Tuple[str, "Exception"]] = []
100 |
101 | return operations, errors
102 |
103 |
104 | def convert_single(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
105 | pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin", token=token, cache_dir=folder)
106 |
107 | sf_name = "model.safetensors"
108 | sf_filename = os.path.join(folder, sf_name)
109 | convert_file(pt_filename, sf_filename)
110 | operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
111 | errors: List[Tuple[str, "Exception"]] = []
112 | return operations, errors
113 |
114 |
115 | def convert_file(
116 | pt_filename: str,
117 | sf_filename: str,
118 | ):
119 | loaded = torch.load(pt_filename, map_location="cpu")
120 | if "state_dict" in loaded:
121 | loaded = loaded["state_dict"]
122 | shared = shared_pointers(loaded)
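    # safetensors refuses to save aliased (shared-storage) tensors, so for each group of
    # names pointing at the same storage we keep only the first and drop the duplicates.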
123 | for shared_weights in shared:
124 | for name in shared_weights[1:]:
125 | loaded.pop(name)
126 |
127 | # For tensors to be contiguous
128 | loaded = {k: v.contiguous() for k, v in loaded.items()}
129 |
130 | dirname = os.path.dirname(sf_filename)
131 | os.makedirs(dirname, exist_ok=True)
132 | save_file(loaded, sf_filename, metadata={"format": "pt"})
133 | check_file_size(sf_filename, pt_filename)
134 | reloaded = load_file(sf_filename)
135 | for k in loaded:
136 | pt_tensor = loaded[k]
137 | sf_tensor = reloaded[k]
138 | if not torch.equal(pt_tensor, sf_tensor):
139 | raise RuntimeError(f"The output tensors do not match for key {k}")
140 |
141 |
142 | def create_diff(pt_infos: Dict[str, List[str]], sf_infos: Dict[str, List[str]]) -> str:
143 | errors = []
144 | for key in ["missing_keys", "mismatched_keys", "unexpected_keys"]:
145 | pt_set = set(pt_infos[key])
146 | sf_set = set(sf_infos[key])
147 |
148 | pt_only = pt_set - sf_set
149 | sf_only = sf_set - pt_set
150 |
151 | if pt_only:
152 | errors.append(f"{key} : PT warnings contain {pt_only} which are not present in SF warnings")
153 | if sf_only:
154 | errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
155 | return "\n".join(errors)
156 |
157 |
158 | def check_final_model(model_id: str, folder: str, token: Optional[str]):
159 | config = hf_hub_download(repo_id=model_id, filename="config.json", token=token, cache_dir=folder)
160 | shutil.copy(config, os.path.join(folder, "config.json"))
161 | config = AutoConfig.from_pretrained(folder)
162 |
163 | import transformers
164 |
165 | class_ = getattr(transformers, config.architectures[0])
166 | with torch.device("meta"):
167 | (pt_model, pt_infos) = class_.from_pretrained(folder, output_loading_info=True)
168 | (sf_model, sf_infos) = class_.from_pretrained(folder, output_loading_info=True)
169 |
170 | if pt_infos != sf_infos:
171 | error_string = create_diff(pt_infos, sf_infos)
172 | raise ValueError(f"Different infos when reloading the model: {error_string}")
173 |
174 | #### XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
175 | #### SKIPPING THE REST OF THE test to save RAM
176 | return
177 | pt_params = pt_model.state_dict()
178 | sf_params = sf_model.state_dict()
179 |
180 | pt_shared = shared_pointers(pt_params)
181 | sf_shared = shared_pointers(sf_params)
182 | if pt_shared != sf_shared:
183 |         raise RuntimeError(f"The reconstructed model is wrong, shared tensors are different {pt_shared} != {sf_shared}")
184 |
185 | sig = signature(pt_model.forward)
186 | input_ids = torch.arange(10).unsqueeze(0)
187 | pixel_values = torch.randn(1, 3, 224, 224)
188 | input_values = torch.arange(1000).float().unsqueeze(0)
189 | # Hardcoded for whisper basically
190 | input_features = torch.zeros((1, 80, 3000))
191 | kwargs = {}
192 | if "input_ids" in sig.parameters:
193 | kwargs["input_ids"] = input_ids
194 | if "input_features" in sig.parameters:
195 | kwargs["input_features"] = input_features
196 | if "decoder_input_ids" in sig.parameters:
197 | kwargs["decoder_input_ids"] = input_ids
198 | if "pixel_values" in sig.parameters:
199 | kwargs["pixel_values"] = pixel_values
200 | if "input_values" in sig.parameters:
201 | kwargs["input_values"] = input_values
202 | if "bbox" in sig.parameters:
203 | kwargs["bbox"] = torch.zeros((1, 10, 4)).long()
204 | if "image" in sig.parameters:
205 | kwargs["image"] = pixel_values
206 |
207 | if torch.cuda.is_available():
208 | pt_model = pt_model.cuda()
209 | sf_model = sf_model.cuda()
210 | kwargs = {k: v.cuda() for k, v in kwargs.items()}
211 |
212 | try:
213 | pt_logits = pt_model(**kwargs)[0]
214 | except Exception as e:
215 | try:
216 | # Musicgen special exception.
217 | decoder_input_ids = torch.ones((input_ids.shape[0] * pt_model.decoder.num_codebooks, 1), dtype=torch.long)
218 | if torch.cuda.is_available():
219 | decoder_input_ids = decoder_input_ids.cuda()
220 |
221 | kwargs["decoder_input_ids"] = decoder_input_ids
222 | pt_logits = pt_model(**kwargs)[0]
223 | except Exception:
224 | raise e
225 | sf_logits = sf_model(**kwargs)[0]
226 |
227 | torch.testing.assert_close(sf_logits, pt_logits)
228 | print(f"Model {model_id} is ok !")
229 |
230 |
231 | def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
232 | try:
233 | main_commit = api.list_repo_commits(model_id)[0].commit_id
234 | discussions = api.get_repo_discussions(repo_id=model_id)
235 | except Exception:
236 | return None
237 | for discussion in discussions:
238 | if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
239 | commits = api.list_repo_commits(model_id, revision=discussion.git_reference)
240 |
241 | if main_commit == commits[1].commit_id:
242 | return discussion
243 | return None
244 |
245 |
246 | def convert_generic(model_id: str, folder: str, filenames: Set[str], token: Optional[str]) -> ConversionResult:
247 | operations = []
248 | errors = []
249 |
250 | extensions = set([".bin", ".ckpt"])
251 | for filename in filenames:
252 | prefix, ext = os.path.splitext(filename)
253 | if ext in extensions:
254 | pt_filename = hf_hub_download(model_id, filename=filename, token=token, cache_dir=folder)
255 | dirname, raw_filename = os.path.split(filename)
256 | if raw_filename == "pytorch_model.bin":
257 | # XXX: This is a special case to handle `transformers` and the
258 | # `transformers` part of the model which is actually loaded by `transformers`.
259 | sf_in_repo = os.path.join(dirname, "model.safetensors")
260 | else:
261 | sf_in_repo = f"{prefix}.safetensors"
262 | sf_filename = os.path.join(folder, sf_in_repo)
263 | try:
264 | convert_file(pt_filename, sf_filename)
265 | operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
266 | except Exception as e:
267 | errors.append((pt_filename, e))
268 | return operations, errors
269 |
270 |
271 | def convert(api: "HfApi", model_id: str, force: bool = False) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
272 | pr_title = "Adding `safetensors` variant of this model"
273 | info = api.model_info(model_id)
274 | filenames = set(s.rfilename for s in info.siblings)
275 |
276 | with TemporaryDirectory() as d:
277 | folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
278 | os.makedirs(folder)
279 | new_pr = None
280 | try:
281 | operations = None
282 | pr = previous_pr(api, model_id, pr_title)
283 |
284 | library_name = getattr(info, "library_name", None)
285 | if any(filename.endswith(".safetensors") for filename in filenames) and not force:
286 | raise AlreadyExists(f"Model {model_id} is already converted, skipping..")
287 | elif pr is not None and not force:
288 | url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
289 | new_pr = pr
290 | raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
291 | elif library_name == "transformers":
292 | if "pytorch_model.bin" in filenames:
293 | operations, errors = convert_single(model_id, folder, token=api.token)
294 | elif "pytorch_model.bin.index.json" in filenames:
295 | operations, errors = convert_multi(model_id, folder, token=api.token)
296 | else:
297 | raise RuntimeError(f"Model {model_id} doesn't seem to be a valid pytorch model. Cannot convert")
298 | check_final_model(model_id, folder, token=api.token)
299 | else:
300 | operations, errors = convert_generic(model_id, folder, filenames, token=api.token)
301 |
302 | if operations:
303 | new_pr = api.create_commit(
304 | repo_id=model_id,
305 | operations=operations,
306 | commit_message=pr_title,
307 | commit_description=COMMIT_DESCRIPTION,
308 | create_pr=True,
309 | )
310 | print(f"Pr created at {new_pr.pr_url}")
311 | else:
312 | print("No files to convert")
313 | finally:
314 | shutil.rmtree(folder)
315 | return new_pr, errors
316 |
317 |
318 | if __name__ == "__main__":
319 | DESCRIPTION = """
320 | Simple utility tool to convert automatically some weights on the hub to `safetensors` format.
321 | It is PyTorch exclusive for now.
322 | It works by downloading the weights (PT), converting them locally, and uploading them back
323 | as a PR on the hub.
324 | """
325 | parser = argparse.ArgumentParser(description=DESCRIPTION)
326 | parser.add_argument(
327 | "model_id",
328 | type=str,
329 | help="The name of the model on the hub to convert. E.g. `gpt2` or `facebook/wav2vec2-base-960h`",
330 | )
331 | parser.add_argument(
332 | "--force",
333 | action="store_true",
334 |         help="Create the PR even if it already exists or if the model was already converted.",
335 | )
336 | parser.add_argument(
337 | "-y",
338 | action="store_true",
339 | help="Ignore safety prompt",
340 | )
341 | args = parser.parse_args()
342 | model_id = args.model_id
343 | api = HfApi()
344 | if args.y:
345 | txt = "y"
346 | else:
347 | txt = input(
348 | "This conversion script will unpickle a pickled file, which is inherently unsafe. If you do not trust this file, we invite you to use"
349 | " https://huggingface.co/spaces/safetensors/convert or google colab or other hosted solution to avoid potential issues with this file."
350 | " Continue [Y/n] ?"
351 | )
352 | if txt.lower() in {"", "y"}:
353 | try:
354 | commit_info, errors = convert(api, model_id, force=args.force)
355 | string = f"""
356 | ### Success 🔥
357 | Yay! This model was successfully converted and a PR was open using your token, here:
358 | [{commit_info.pr_url}]({commit_info.pr_url})
359 | """
360 | if errors:
361 | string += "\nErrors during conversion:\n"
362 | string += "\n".join(
363 | f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors
364 | )
365 | print(string)
366 | except Exception as e:
367 | print(
368 | f"""
369 | ### Error 😢😢😢
370 |
371 | {e}
372 | """
373 | )
374 | else:
375 | print(f"Answer was `{txt}` aborting.")
--------------------------------------------------------------------------------
/gem5/pytest_simulator.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gem5 import simulator
3 | from gem5.simulator import PieEnvironment, PieSingleResult, PiePairResult, make
4 | import numpy as np
5 | from collections import defaultdict
6 | from pprint import pprint
7 |
8 | API_KEY="cdZ5TynkL5D7gCTFvzJT4YKu05aozTLp4GgIcK5"
9 |
10 | example_1_code = """
11 | #include <bits/stdc++.h>
12 | #define REP(i, n) for (int i = 0; i < (n); i++)
13 | using namespace std;
14 | const int MOD = 998244353;
15 |
16 | int main() {
17 | cin.tie(0)->sync_with_stdio(false);
18 |
19 | int n, k; cin >> n >> k;
20 |     vector<int> l(k), r(k);
21 | REP(i, k) cin >> l[i] >> r[i];
22 | REP(i, k) r[i]++;
23 |
24 |     vector<long long> dp(n + 1, 0);
25 | dp[0] = 1;
26 | dp[1] = -1;
27 | REP(i, n) {
28 | if (i > 0)
29 | dp[i] = (dp[i] + dp[i - 1]) % MOD;
30 | REP(j, k) {
31 | if (i + l[j] < n)
32 | dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
33 | if (i + r[j] < n)
34 | dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
35 | }
36 | }
37 | cout << dp[n - 1] << endl;
38 | return 0;
39 | }
40 | """
41 | example_1_problem_id = "p02549"
42 |
43 | example_2_code = """
44 | #include <bits/stdc++.h>
45 | #include <vector>
46 | typedef long long ll;
47 | typedef unsigned int ui;
48 | #define infin (ll)(998244353)
49 | using namespace std;
50 | int main()
51 | {
52 | int n,k;
53 | cin>>n>>k;
54 | int l,r;
55 |     vector<ll> dp(n+1,0); //0 to n
56 |     vector<pair<int,int> >v;
57 |     for(int j=0;j<k;j++)
58 |     {
59 |     cin>>l>>r;
60 | v.push_back({l,r});
61 | }
62 | dp[0]=1;;
63 | dp[1]=1;
64 | sort(v.begin(),v.end());
65 | auto z=v.begin();
66 | if ((*z).first==1)
67 | dp[2]=1;
68 | else
69 | dp[2]=0;
70 | for(int i=3;i<=n;i++)
71 | {
72 | dp[i]=dp[i-1];
73 | for (auto x:v)
74 | {
75 | if (i>x.first)
76 | dp[i]+=dp[i-x.first];
77 | else
78 | break;
79 | if (i-1>x.second)
80 | {
81 | dp[i]-=dp[i-1-x.second];
82 | if (dp[i]<0)
83 | dp[i]+=infin;
84 | }
85 | }
86 | dp[i]=(dp[i]) % infin;
87 | }
88 | cout<<dp[n];
89 | }
90 | """
91 | example_2_problem_id = "p02549"
128 |             assert result.mean_acc_v0 > 0.95
129 | assert result.mean_acc_v1 > 0.95
130 |
131 | pprint(result.tc2time_v0)
132 | pprint(result.tc2time_v1)
133 |
134 | print(
135 | f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
136 | print(
137 | f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
138 | print(
139 | f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
140 | print(
141 | f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
142 |
143 | assert result.tc2time_v0[0] == 0.001035073468
144 | assert result.tc2time_v0[1] == 0.001039205596
145 |
146 | assert result.tc2time_v1[0] == 0.001026564396
147 | assert result.tc2time_v1[1] == 0.001029346032
148 |
149 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
150 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
151 |
152 | for tc, time in hyperfine_v0_tc2stats.items():
153 | tc2hyperfine_v0[tc].append(np.array(time))
154 | for tc, time in hyperfine_v1_tc2stats.items():
155 | tc2hyperfine_v1[tc].append(np.array(time))
156 |
157 | for tc, times_v0 in tc2hyperfine_v0.items():
158 | mean_times_v0 = []
159 | for time_list in times_v0:
160 | mean_times_v0.append(np.mean(time_list))
161 | mean_times_v1 = []
162 | for time_list in tc2hyperfine_v1[tc]:
163 | mean_times_v1.append(np.mean(time_list))
164 | # consistency check
165 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)
166 | ) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
167 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)
168 | ) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
169 | # performance check
170 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)
171 | ) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
172 | print(
173 | f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
174 | print(
175 | f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
176 | print(
177 | f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
178 |
179 | assert len(tc2hyperfine_v0) == 2
180 | assert len(tc2hyperfine_v1) == 2
181 |
182 |
183 | def test_single_submission(self, get_pie_env):
184 | env = get_pie_env
185 | tc2hyperfine = defaultdict(list)
186 | for _ in range(2):
187 | result = env.submit_single_submission(code=example_1_code,
188 | testcases=[0,1],
189 | problem_id=example_1_problem_id,
190 | timing_env="both")
191 |
192 | assert result.compilation == True
193 | assert result.tc2success[0] == True
194 | assert result.tc2success[1] == True
195 | assert result.tc2time[0] == 0.001035073468
196 | assert result.tc2time[1] == 0.001039205596
197 | assert result.mean_acc > 0.95
198 |
199 | hyperfine_result = result.tc2stats_binary
200 |
201 | for tc, results in hyperfine_result.items():
202 | tc2hyperfine[tc].append(np.array(results))
203 |
204 | for tc, times in tc2hyperfine.items():
205 | mean_times = []
206 | for time_list in times:
207 | mean_times.append(np.mean(time_list))
208 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
209 | assert len(tc2hyperfine) == 2
210 |
211 |
212 |
213 | def test_dual_submission_diff_code(self, get_pie_env):
214 | env = get_pie_env
215 | tc2hyperfine_v0 = defaultdict(list)
216 | tc2hyperfine_v1 = defaultdict(list)
217 | for _ in range(2):
218 | result = env.submit_single_submission_pair(code_v0=example_1_code,
219 | code_v1=example_2_code,
220 | testcases=[0,1],
221 | problem_id=example_1_problem_id,
222 | timing_env="both")
223 |
224 |
225 | assert result.compilation_v0 == True
226 | assert result.compilation_v1 == True
227 |
228 | assert result.mean_acc_v0 > 0.95
229 | assert result.mean_acc_v1 > 0.95
230 |
231 | pprint(result.tc2time_v0)
232 | pprint(result.tc2time_v1)
233 |
234 | print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
235 | print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
236 | print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
237 | print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
238 |
239 | assert result.tc2time_v0[0] == 0.001035073468
240 | assert result.tc2time_v0[1] == 0.001039205596
241 |
242 | assert result.tc2time_v1[0] == 0.001026564396
243 | assert result.tc2time_v1[1] == 0.001029346032
244 |
245 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
246 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
247 |
248 | for tc, time in hyperfine_v0_tc2stats.items():
249 | tc2hyperfine_v0[tc].append(np.array(time))
250 | for tc, time in hyperfine_v1_tc2stats.items():
251 | tc2hyperfine_v1[tc].append(np.array(time))
252 |
253 | for tc, times_v0 in tc2hyperfine_v0.items():
254 | mean_times_v0 = []
255 | for time_list in times_v0 :
256 | mean_times_v0.append(np.mean(time_list))
257 | mean_times_v1 = []
258 | for time_list in tc2hyperfine_v1[tc] :
259 | mean_times_v1.append(np.mean(time_list))
260 | # consistency check
261 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
262 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
263 | # performance check
264 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
265 | print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
266 | print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
267 | print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
268 |
269 | assert len(tc2hyperfine_v0) == 2
270 | assert len(tc2hyperfine_v1) == 2
271 |
272 |
273 | def test_dual_submission_same_code(self, get_pie_env):
274 | env = get_pie_env
275 | tc2hyperfine_v0 = defaultdict(list)
276 | tc2hyperfine_v1 = defaultdict(list)
277 | for _ in range(2):
278 | result = env.submit_single_submission_pair(code_v0=example_1_code,
279 | code_v1=example_1_code,
280 | testcases=[0,1],
281 | problem_id=example_1_problem_id,
282 | timing_env="both")
283 |
284 |
285 | assert result.compilation_v0 == True
286 | assert result.compilation_v1 == True
287 |
288 | assert result.mean_acc_v0 > 0.95
289 | assert result.mean_acc_v1 > 0.95
290 |
291 | pprint(result.tc2time_v0)
292 | pprint(result.tc2time_v1)
293 |
294 | print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
295 | print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
296 | print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001035073468")
297 | print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001039205596")
298 |
299 | assert result.tc2time_v0[0] == 0.001035073468
300 | assert result.tc2time_v0[1] == 0.001039205596
301 |
302 | assert result.tc2time_v1[0] == 0.001035073468
303 | assert result.tc2time_v1[1] == 0.001039205596
304 |
305 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
306 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
307 |
308 | for tc, time in hyperfine_v0_tc2stats.items():
309 | tc2hyperfine_v0[tc].append(np.array(time))
310 | for tc, time in hyperfine_v1_tc2stats.items():
311 | tc2hyperfine_v1[tc].append(np.array(time))
312 |
313 | for tc, times_v0 in tc2hyperfine_v0.items():
314 | times_v1 = tc2hyperfine_v1[tc]
315 | mean_times = []
316 | for time_list in times_v0 + times_v1:
317 | mean_times.append(np.mean(time_list))
318 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
319 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
320 | assert len(tc2hyperfine_v0) == 2
321 |
322 |
323 |
324 | def test_multiple_single_submissions(self, get_pie_env):
325 |
326 |
327 | code_list = [example_1_code, example_2_code] * 3
328 | testcases_list = [[0, 1], [0, 1]] * 3
329 | problem_id_list = [example_1_problem_id, example_2_problem_id] * 3
330 | override_flags_list = ["", ""] * 3
331 |
332 | env = get_pie_env
333 |
334 | results = env.submit_multiple_single_submissions(code_list=code_list,
335 | testcases_list=testcases_list,
336 | problem_id_list=problem_id_list,
337 | override_flags_list=override_flags_list,
338 | timing_env="both")
339 |
340 | tc2hyperfine_v0 = defaultdict(list)
341 | tc2hyperfine_v1 = defaultdict(list)
342 |
343 | for i, result in enumerate(results):
344 | assert result.compilation == True
345 | assert result.tc2success[0] == True
346 | assert result.tc2success[1] == True
347 |
348 | hyperfine_result = result.tc2stats_binary
349 |
350 | if (i % 2) == 0:
351 | assert result.tc2time[0] == 0.001035073468
352 | assert result.tc2time[1] == 0.001039205596
353 | tc2hyperfine = tc2hyperfine_v0
354 | else:
355 | assert result.tc2time[0] == 0.001026564396
356 | assert result.tc2time[1] == 0.001029346032
357 | tc2hyperfine = tc2hyperfine_v1
358 |
359 | for tc, results in hyperfine_result.items():
360 | tc2hyperfine[tc].append(np.array(results))
361 |
362 | for tc, times_v0 in tc2hyperfine_v0.items():
363 | mean_times_v0 = []
364 | for time_list in times_v0 :
365 | mean_times_v0.append(np.mean(time_list))
366 | mean_times_v1 = []
367 | for time_list in tc2hyperfine_v1[tc] :
368 | mean_times_v1.append(np.mean(time_list))
369 |
370 | print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
371 | print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
372 | print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
373 |
374 | # consistency check
375 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
376 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
377 | # performance check
378 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
379 |
380 |
381 | assert len(tc2hyperfine_v0) == 2
382 | assert len(tc2hyperfine_v1) == 2
--------------------------------------------------------------------------------
/gem5/gem5_api.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify
2 | import argparse
3 | import json
4 | import logging
5 | from datetime import datetime
6 | import os
7 | from joblib import Parallel, delayed
8 | import benchmarking
9 | import tempfile
10 | import multiprocessing
11 | import numpy as np
12 | import joblib
13 | from tqdm import tqdm
14 | import contextlib
15 |
16 | LOGGING_DIR="/home/logs/"
17 | if not os.path.exists(LOGGING_DIR):
18 | os.makedirs(LOGGING_DIR)
19 |
20 |
21 | logger = logging.getLogger(__name__)
22 | logger.setLevel(logging.CRITICAL)
23 |
24 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
25 |
26 | # Create a file handler for the log file
27 | start_date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
28 | file_handler = logging.FileHandler(os.path.join(LOGGING_DIR, start_date_time + "_gem5_api.log"))
29 | file_handler.setLevel(logging.DEBUG)
30 | file_handler.setFormatter(formatter)
31 |
32 | # Create a stream handler to print the logs to stdout
33 | stream_handler = logging.StreamHandler()
34 | stream_handler.setLevel(logging.INFO)
35 | stream_handler.setFormatter(formatter)
36 |
37 | # Add both handlers to the logger
38 | logger.addHandler(file_handler)
39 | logger.addHandler(stream_handler)
40 |
41 |
42 | app = Flask(__name__)
43 |
44 |
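# QUEUE is filled in init_globals() with the ids of free CPU cores; each submission
# worker pops an id, pins its compile/benchmark subprocesses to that core via
# `taskset --cpu-list`, and pushes the id back when it finishes.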
45 | global MANAGER
46 | global QUEUE
47 | global N_CPUS
48 | MANAGER = ...
49 | QUEUE = ...
50 | N_CPUS=... # Will be set in init_globals after parse_args()
51 |
52 | @contextlib.contextmanager
53 | def tqdm_joblib(tqdm_object):
54 | """Context manager to patch joblib to report into tqdm progress bar given as argument"""
55 | class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
56 | def __call__(self, *args, **kwargs):
57 | tqdm_object.update(n=self.batch_size)
58 | return super().__call__(*args, **kwargs)
59 |
60 | old_batch_callback = joblib.parallel.BatchCompletionCallBack
61 | joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
62 | try:
63 | yield tqdm_object
64 | finally:
65 | joblib.parallel.BatchCompletionCallBack = old_batch_callback
66 | tqdm_object.close()
67 |
68 |
69 | def init_globals(n_workers: int = -1, use_logical_cpus: bool = False):
70 | global MANAGER
71 | global QUEUE
72 | global N_CPUS
73 |
74 | MANAGER = multiprocessing.Manager()
75 | QUEUE = MANAGER.Queue()
76 | if use_logical_cpus:
77 | cpu_list = benchmarking.add_logicial_cpus_to_queue(n_workers, QUEUE)
78 | else:
79 | cpu_list = benchmarking.add_physical_cpus_to_queue(n_workers, QUEUE)
80 | N_CPUS = len(cpu_list)
81 | print(f"Initialized globals with {N_CPUS} cpus")
82 | return None
83 |
84 |
85 | def parse_args():
86 | parser = argparse.ArgumentParser(description='Gem5 API')
87 | parser.add_argument('--api_key', type=str, help='required API key on initialization for authentication')
88 | parser.add_argument('--port', type=int, default=706965, help='port number')
89 | parser.add_argument('--working_dir', type=str, default='/home/working_dir', help='working directory')
90 | parser.add_argument('--use_logical_cpus', default=False, action="store_true")
91 | parser.add_argument('--workers', type=int, default=-1, help='number of workers, if <0 (e.g. -1) then it uses all available physical cpus')
92 | parser.add_argument('--threaded', default=False, action="store_true")
93 | parser.add_argument('--gem5_acc_threshold', type=float, default=0.95, help="mean threshold where if below this, we do not run gem5")
94 | parser.add_argument('--debug', default=False, action="store_true")
95 | parser.add_argument('--exit_early_on_fail', action="store_true")
96 | ## gem5 and compilation parameters
97 | parser.add_argument('--testcases_dir', type=str, help='testcases directory', default="/home/pie-perf/data/codenet/merged_test_cases/")
98 | parser.add_argument('--cstd', type=str, help='cstd', default='--std=c++17')
99 | parser.add_argument('--optimization_flag', type=str, help='optimization', default='-O3')
100 | parser.add_argument('--gem5_dir', type=str, help='path containing gem5 binary and build', default='/home/gem5/build/X86/')
101 | parser.add_argument('--gem5_script_path', type=str, help='path to gem5 script', default='/home/gem5-skylake-config/gem5-configs/run-se.py')
102 | parser.add_argument('--cpu_type', type=str, help='cpu type', default='Verbatim')
103 | parser.add_argument('--path_to_atcoder', type=str, help='path to atcoder', default='/home/ac-library/')
104 | parser.add_argument('--timeout_seconds_binary', type=int, help='timeout seconds for binary', default=10)
105 | parser.add_argument('--timeout_seconds_gem5', type=int, help='timeout seconds for gem5', default=120)
106 |
107 |
108 | args = parser.parse_args()
109 | app.config.update(vars(args))
110 | return args
111 |
112 | def single_submission(code, testcases, problem_id, timing_env, queue, override_flags=""):
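    # Returns a dict with "compile_success" and per-testcase "accs"; depending on
    # timing_env it also contains "gem5" and/or "binary" timing results.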
113 | ## TODO -> check if any test cases are missing with hyperfine
114 | logging.info(f"single_submission for problem {problem_id} with timing_env {timing_env} and testcases {testcases}")
115 | override_flags = "" if not isinstance(override_flags, str) else override_flags
116 | result = {}
117 | cpu_number = queue.get(block=True) if timing_env in ("binary", "both") else None
118 | logging.info(f"got cpu {cpu_number} in pid {os.getpid()}")
119 | with tempfile.TemporaryDirectory() as tmpdirname:
120 | code_path = os.path.join(tmpdirname, 'code.cpp')
121 | with open(code_path, 'w') as f:
122 | f.write(code)
123 | print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']} override_flags: {override_flags }")
124 |         cflags = app.config['cstd'] + ' ' + app.config['optimization_flag'] + ' ' + override_flags
125 | bin_path, accs = benchmarking.compile_and_check_outputs(
126 | code_path=code_path,
127 | problem_id=problem_id,
128 | testcases_dir=app.config['testcases_dir'],
129 | timeout=app.config['timeout_seconds_binary'],
130 | cflags=cflags,
131 | testcases=testcases,
132 | cpu_number=cpu_number)
133 | result["compile_success"] = bin_path is not None
134 | result['accs'] = accs
135 | mean_accs = np.mean(list(accs.values()))
136 | logging.info(f"mean_accs: {mean_accs}")
137 | if mean_accs < app.config["gem5_acc_threshold"]:
138 | logging.info(f"mean_accs: {mean_accs} is below threshold {app.config['gem5_acc_threshold']}, skipping gem5")
139 | if timing_env in ["gem5", "both"]:
140 | result["gem5"] = {} # return empty dict
141 | if timing_env in ["binary", "both"]:
142 | result["binary"] = {} # return empty dict
143 | return result
144 |
145 | if timing_env in ['gem5', 'both']:
146 | logging.info(f"running gem5 for problem {problem_id}")
147 | gem5_results = benchmarking.run_gem5(
148 | gem5_dir=app.config['gem5_dir'],
149 | gem5_script_path=app.config['gem5_script_path'],
150 | cpu_type=app.config['cpu_type'],
151 | bin_path=bin_path,
152 | problem_id=problem_id,
153 | testcases_dir=app.config['testcases_dir'],
154 | timeout=app.config['timeout_seconds_gem5'],
155 | testcases=testcases,
156 | cpu_number=cpu_number,
157 | exit_early_on_fail=app.config['exit_early_on_fail'])
158 | result['gem5'] = gem5_results
159 | if timing_env in ['binary', 'both']:
160 | code2results, output = benchmarking.run_hyperfine(
161 | code_paths=[code_path],
162 | problem_ids=[problem_id],
163 | path_to_testcases=app.config['testcases_dir'],
164 | # TODO: REMOVE THIS HERE
165 | json_out_path=os.path.join(tmpdirname, 'hyperfine_results.json'),
166 | test_cases_list=[testcases],
167 | min_runs_per_test_case=10,
168 | max_runs_per_test_case=500,
169 | warmup_runs_per_test_case=5,
170 | cpu_number=cpu_number,
171 | do_sanity_check=True) # TODO: PIN TO CPU
172 | binary_results = code2results[code_path]
173 | result["binary"] = binary_results
174 | queue.put(cpu_number)
175 | return result
176 |
177 |
178 | def dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0="", override_flags_v1=""):
179 | override_flags_v0 = "" if not isinstance(override_flags_v0, str) else override_flags_v0
180 | override_flags_v1 = "" if not isinstance(override_flags_v1, str) else override_flags_v1
181 | result = {}
182 | cpu_number = queue.get(block=True)
183 | with tempfile.TemporaryDirectory() as tmpdirname_v0, tempfile.TemporaryDirectory() as tmpdirname_v1:
184 | code_path_v0 = os.path.join(tmpdirname_v0, 'code.cpp')
185 | with open(code_path_v0, 'w') as f:
186 | f.write(code_v0)
187 | code_path_v1 = os.path.join(tmpdirname_v1, 'code.cpp')
188 | with open(code_path_v1, 'w') as f:
189 | f.write(code_v1)
190 |
191 | print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']} override_flags_v0: {override_flags_v0 }")
192 |         cflags_v0 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + ' ' + override_flags_v0
193 |         cflags_v1 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + ' ' + override_flags_v1
194 |
195 | bin_path_v0, accs_v0 = benchmarking.compile_and_check_outputs(
196 | code_path=code_path_v0,
197 | problem_id=problem_id,
198 | testcases_dir=app.config['testcases_dir'],
199 | timeout=app.config['timeout_seconds_binary'],
200 | cflags=cflags_v0,
201 | testcases=testcases,
202 | cpu_number=cpu_number)
203 | bin_path_v1, accs_v1 = benchmarking.compile_and_check_outputs(
204 | code_path=code_path_v1,
205 | problem_id=problem_id,
206 | testcases_dir=app.config['testcases_dir'],
207 | timeout=app.config['timeout_seconds_binary'],
208 | cflags=cflags_v1,
209 | testcases=testcases,
210 | cpu_number=cpu_number)
211 | result["compile_success_v0"] = bin_path_v0 is not None
212 | result["compile_success_v1"] = bin_path_v1 is not None
213 | result['accs_v0'] = accs_v0
214 | result['accs_v1'] = accs_v1
215 | if timing_env in ['gem5', 'both']:
216 | gem5_results_v0 = benchmarking.run_gem5(
217 | gem5_dir=app.config['gem5_dir'],
218 | gem5_script_path=app.config['gem5_script_path'],
219 | cpu_type=app.config['cpu_type'],
220 | bin_path=bin_path_v0,
221 | problem_id=problem_id,
222 | testcases_dir=app.config['testcases_dir'],
223 | timeout=app.config['timeout_seconds_gem5'],
224 | testcases=testcases,
225 | cpu_number=cpu_number,
226 | exit_early_on_fail=app.config['exit_early_on_fail'])
227 | result['gem5_v0'] = gem5_results_v0
228 | gem5_results_v1 = benchmarking.run_gem5(
229 | gem5_dir=app.config['gem5_dir'],
230 | gem5_script_path=app.config['gem5_script_path'],
231 | cpu_type=app.config['cpu_type'],
232 | bin_path=bin_path_v1,
233 | problem_id=problem_id,
234 | testcases_dir=app.config['testcases_dir'],
235 | timeout=app.config['timeout_seconds_gem5'],
236 | testcases=testcases,
237 | cpu_number=cpu_number,
238 | exit_early_on_fail=app.config['exit_early_on_fail'])
239 | result['gem5_v1'] = gem5_results_v1
240 | if timing_env in ['binary', 'both']:
241 | code2results, output = benchmarking.run_hyperfine(
242 | code_paths=[code_path_v0, code_path_v1],
243 | problem_ids=[problem_id, problem_id],
244 | path_to_testcases=app.config['testcases_dir'],
245 | json_out_path=os.path.join(tmpdirname_v0, 'hyperfine_results.json'),
246 | test_cases_list=[testcases, testcases],
247 | min_runs_per_test_case=10,
248 | max_runs_per_test_case=500,
249 | warmup_runs_per_test_case=5,
250 | cpu_number=cpu_number,
251 | do_sanity_check=True)
252 | result["binary_v0"] = code2results[code_path_v0]
253 | result["binary_v1"] = code2results[code_path_v1]
254 | queue.put(cpu_number)
255 | return result
256 |
257 |
258 | def multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list=None):
259 | assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
260 | with tqdm_joblib(tqdm(desc="Running multiple single submissions", total=len(code_list))) as progress_bar:
261 | results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(single_submission)(code, testcases, problem_id, timing_env, queue, override_flags) for code, testcases, problem_id, override_flags in zip(code_list, testcases_list, problem_id_list, override_flags_list))
262 | return results
263 |
264 | def multiple_dual_submissions(code_v0_list, code_v1_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list_v0, override_flags_list_v1):
265 | assert len(code_v0_list) == len(code_v1_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(override_flags_list_v1)
266 | results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(dual_submission)(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0, override_flags_v1) for code_v0, code_v1, testcases, problem_id, override_flags_v0, override_flags_v1 in zip(code_v0_list, code_v1_list, testcases_list, problem_id_list, override_flags_list_v0, override_flags_list_v1))
267 | return results
268 |
269 |
270 | @app.route('/gem5/single_submission', methods=['GET'])
271 | def SingleSubmission():
272 | req = request.get_json()
273 | if req["api_key"] != app.config["api_key"]:
274 | return jsonify({"error": "Invalid API key"})
275 | code = req['code']
276 | testcases = req['testcases']
277 | problem_id = req['problem_id']
278 | timing_env = req['timing_env']
279 | assert len(testcases) > 0
280 | assert len(code) > 0
281 | assert timing_env in ['gem5', 'binary', 'both']
282 |
283 | override_flags = req.get('override_flags', "")
284 | results = single_submission(code, testcases, problem_id, timing_env, QUEUE, override_flags)
285 | return jsonify(results)
286 |
287 | @app.route('/gem5/multiple_single_submissions', methods=['GET'])
288 | def MultipleSubmissions():
289 | req = request.get_json()
290 | if req["api_key"] != app.config["api_key"]:
291 | return jsonify({"error": "Invalid API key"})
292 | submissions = req['submissions']
293 | timing_env = req['timing_env']
294 | code_list = [r['code'] for r in submissions]
295 | testcases_list = [r['testcases'] for r in submissions]
296 | problem_id_list = [r['problem_id'] for r in submissions]
297 | override_flags_list = [r.get('override_flags_list', "") for r in submissions]
298 |
299 | assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
300 | assert timing_env in ['gem5', 'binary', 'both']
301 | assert len(code_list) > 0
302 | assert len(testcases_list) > 0
303 | assert len(problem_id_list) > 0
304 | assert len(override_flags_list) > 0
305 | assert all([len(code) > 0 for code in code_list])
306 | assert all([len(testcases) > 0 for testcases in testcases_list])
307 |
308 | results = multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list)
309 |
310 | return jsonify(results)
311 |
312 | @app.route('/gem5/single_submission_pair', methods=['GET'])
313 | def SingleSubmissionPair():
314 | req = request.get_json()
315 | if req["api_key"] != app.config["api_key"]:
316 | return jsonify({"error": "Invalid API key"})
317 | #assert len(req) == 2
318 | code_v0 = req['code_v0']
319 | code_v1 = req['code_v1']
320 | testcases = req['testcases']
321 | problem_id = req['problem_id']
322 | timing_env = req['timing_env']
323 | assert len(testcases) > 0
324 | assert len(code_v0) > 0
325 | assert len(code_v1) > 0
326 | assert timing_env in ['gem5', 'binary', 'both']
327 |
328 | override_flags = req.get('override_flags', "")
329 | results = dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, QUEUE, override_flags)
330 | return jsonify(results)
331 |
332 | @app.route('/gem5/multiple_submissions_pairs', methods=['GET'])
333 | def MultipleSubmissionsPair():
334 | req = request.get_json()
335 | if req["api_key"] != app.config["api_key"]:
336 | return jsonify({"error": "Invalid API key"})
337 | submissions_v0 = req['submissions_v0']
338 | submissions_v1 = req['submissions_v1']
339 | timing_env = req['timing_env']
340 |
341 | code_list_v0 = [r['code'] for r in submissions_v0]
342 | code_list_v1 = [r['code'] for r in submissions_v1]
343 | testcases_list = [r['testcases'] for r in submissions_v0]
344 | problem_id_list = [r['problem_id'] for r in submissions_v0]
345 |
346 | override_flags_list_v0 = [r.get('override_flags_list', "") for r in submissions_v0]
347 | override_flags_list_v1 = [r.get('override_flags_list', "") for r in submissions_v1]
348 |
349 | assert len(code_list_v0) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(code_list_v1) == len(override_flags_list_v1)
350 | assert timing_env in ['gem5', 'binary', 'both']
351 | assert len(code_list_v0) > 0
352 | assert len(testcases_list) > 0
353 | assert len(problem_id_list) > 0
354 | assert all([len(code) > 0 for code in code_list_v0])
355 | assert all([len(code) > 0 for code in code_list_v1])
356 | assert all([len(testcases) > 0 for testcases in testcases_list])
357 |
358 | results = multiple_dual_submissions(code_list_v0, code_list_v1, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list_v0, override_flags_list_v1)
359 | return jsonify(results)
360 |
361 | @app.route('/gem5/ping', methods=['GET'])
362 | def Ping():
363 | return jsonify({"status": "ok"})
364 |
365 |
366 | if __name__ == '__main__':
367 | args = parse_args()
368 | init_globals(args.workers, args.use_logical_cpus)
369 | app.run(host="0.0.0.0", port=args.port, debug=args.debug)
370 |
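# Example client sketch (URL, port, and values are illustrative, not defined in this file):
#   import requests
#   requests.get("http://localhost:4242/gem5/ping").json()   # -> {"status": "ok"}
#   requests.get("http://localhost:4242/gem5/single_submission", json={
#       "api_key": "...", "code": cpp_source, "testcases": [0, 1],
#       "problem_id": "p02549", "timing_env": "binary"}).json()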
371 |
372 |
373 |
374 |
375 |
376 |
377 |
--------------------------------------------------------------------------------
/gem5/benchmarking.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 | import shutil
4 | import os
5 | import warnings
6 | import traceback
7 | import logging
8 | import subprocess
9 | import glob
10 | import re
11 | import traceback
12 | import time
13 | import shlex
14 | from typing import Optional, List, Tuple, Dict, Any, Union
15 | import multiprocessing
16 | from collections import defaultdict
17 | import json
18 | import resource
19 | import re
20 | import ast
21 | from dataclasses import dataclass
22 |
23 | logging.basicConfig(level=logging.DEBUG)
24 | logging.getLogger("resource").setLevel(logging.DEBUG)
25 |
26 | MAX_VIRTUAL_MEMORY = 10 * 1024 * 1024 * 50 # 500 MB
27 |
28 | # from https://gist.github.com/s3rvac/f97d6cbdfdb15c0a32e7e941f7f4a3fa
29 | def limit_virtual_memory():
30 | resource.setrlimit(resource.RLIMIT_AS, (MAX_VIRTUAL_MEMORY, MAX_VIRTUAL_MEMORY * 10))
31 |
32 |
33 | def get_accuracy(output: str, ground_truth: str) -> float:
34 | """
35 | Compare the output of the code with the ground truth.
36 | """
37 | num_correct = 0
38 | ground_truth_lines = ground_truth.strip().splitlines()
39 | output_truth_lines = output.strip().splitlines()
40 | for gen_output, ground_truth_output in zip(output_truth_lines, ground_truth_lines):
41 | is_corr = gen_output == ground_truth_output
42 | if not is_corr:
43 | try:
44 | gen_output = float(gen_output)
45 | ground_truth_output = float(ground_truth_output)
46 | is_corr = abs(gen_output - ground_truth_output) < 1e-3
47 | except:
48 | pass
49 | num_correct += int(is_corr)
50 |
51 | return num_correct / len(ground_truth_lines)
52 |
53 | def compile_cpp_code(code_path: str, timeout: int = 30, output_path: str = None, cflags: str = "--std=c++17 -O3", cpu_number: Optional[int] = None) -> str:
54 |     """Compile a single C++ source file with g++.
55 | 
56 |     Args:
57 |         code_path (str): path to the .cpp source file to compile.
58 |         output_path (str, optional): path for the produced binary; defaults to the source path with a .out extension.
59 |         cflags (str, optional): compiler flags passed to g++.
60 | 
61 |     Returns:
62 |         str: path to the compiled binary.
63 |     """
64 | if output_path is None:
65 | output_path = os.path.join(os.path.dirname(code_path), f"{os.path.splitext(os.path.basename(code_path))[0]}.out")
66 | cpu_cmd = f"taskset --cpu-list {cpu_number}" if cpu_number is not None else ""
67 |
68 | cmd = shlex.split(cpu_cmd) + ["/usr/bin/g++", code_path, "-o", output_path] + shlex.split(cflags.replace('"', "").replace("'", ""))
69 | logging.critical(f"Running command: {' '.join(cmd)}")
70 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
71 | if p.returncode != 0:
72 | raise Exception(f"Error compiling code: {code_path} with command: {' '.join(cmd)}, return code: {p.returncode}, stderr: {p.stderr}")
73 | else:
74 | # sometimes there can be latency in the file system, so we wait a bit
75 | while(not os.path.exists(output_path)):
76 | time.sleep(0.05)
77 | return output_path
78 |
79 | def exec_bin(bin_path, in_path, timeout, cpu_number=None):
80 | logging.info(f'executing {bin_path}, with input {in_path}')
81 | if in_path is not None:
82 | fh = open(in_path, 'r')
83 | else:
84 | fh = subprocess.DEVNULL
85 | cmd = [bin_path]
86 | if cpu_number is not None:
87 | cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
88 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
89 | if in_path is not None:
90 | fh.close()
91 | return p.returncode, p.stdout, p.stderr
92 |
93 | def exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout: Optional[int] = None, cpu_number=None):
94 | gem5_bin = os.path.join(gem5_dir, 'gem5.opt')
95 | cmd = shlex.split(f"{gem5_bin} --stats-file={stats_out_path} {gem5_script_path} {cpu_type} {bin_path}")
96 | if cpu_number is not None:
97 | cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
98 | if in_path is not None:
99 | logging.info(f'executing {" ".join(cmd)}, with input {in_path}')
100 | with open(in_path, 'r') as fh:
101 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
102 | else:
103 | logging.info(f'executing {" ".join(cmd)}, with no input')
104 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
105 | return p.returncode, p.stdout, p.stderr
106 |
107 | def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout=None):
108 | logging.info(f'executing {bin_path}, with input {in_path}')
109 | with open(in_path, 'r') as fh:
110 | p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
111 | if p.returncode != 0:
112 |         raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr}")
113 | else:
114 | return get_accuracy(p.stdout, ground_truth_output)
115 |
116 | def compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout=None, cflags: str ="--std=c++17 -O3", testcases: List[int] = None, cpu_number=None):
117 |
118 | input_output_pairs = {}
119 | input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
120 | for in_path in input_paths:
121 | tc_no = re.search(r"input\.(\d+)\.txt", in_path).group(1)
122 | if testcases is not None and int(tc_no) not in testcases and tc_no not in testcases: # allow both int and str
123 | continue
124 | out_path = os.path.join(testcases_dir, problem_id, f"output.{tc_no}.txt")
125 | input_output_pairs[tc_no] = (in_path, out_path)
126 | logging.info(f"Found {len(input_output_pairs)} testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
127 | try:
128 | bin_path = compile_cpp_code(code_path, timeout, cflags=cflags, cpu_number=cpu_number)
129 | logging.info(f"Compiled {code_path} to {bin_path}")
130 | except Exception as e:
131 | return None, {tc_no: 0 for tc_no in input_output_pairs.keys()}
132 |
133 | accs = {}
134 |
135 | for tc_no, (in_path, out_path) in input_output_pairs.items():
136 | with open(out_path, 'r') as fh:
137 | ground_truth_output = fh.read().strip()
138 | try:
139 | acc = exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout)
140 | accs[tc_no] = acc
141 | except Exception as e:
142 | logging.error(f"Error executing code: {bin_path} with input: {in_path}, error: {e}")
143 | accs[tc_no] = 0
144 |
145 | logging.info(f"bin_path: {bin_path}, accs: {accs}")
146 |
147 | return bin_path, accs
148 |
149 | def compile_and_check_outputs_multi(
150 | code_paths,
151 | problem_ids,
152 | testcases_dir,
153 | timeout=None,
154 | cflags: str ="--std=c++17 -O3",
155 | test_cases_list = None,
156 | cpu_number=None):
157 | if test_cases_list is None:
158 | test_cases_list = [None for _ in range(len(code_paths))]
159 | code2results = defaultdict(dict)
160 | for code_path, problem_id, test_cases in zip(code_paths, problem_ids, test_cases_list):
161 | bin_path, accs = compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout, cflags, test_cases, cpu_number)
162 | code2results[code_path]["compile_success"] = bin_path is not None
163 | code2results[code_path]["bin_path"] = bin_path
164 | code2results[code_path]["accs"] = accs
165 | return code2results
166 |
167 |
168 | def calc_sim_seconds(stats):
169 | return float(stats["sim_ticks"]) / float(stats["sim_freq"]) # more accurate than sim_seconds
170 |
171 |
172 | def parse_stats_txt(stats_path):
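    # gem5's stats.txt consists of lines like "key value [value ...]  # comment"; strip the
    # trailing comment, parse values with ast.literal_eval, and map nan/inf to None.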
173 | with open(stats_path, 'r') as f:
174 | stats_lines = f.readlines()
175 |
176 | stats = {}
177 | for line in stats_lines:
178 | if line.strip() == '':
179 | continue
180 | if "Begin" in line:
181 | continue
182 | if "End" in line:
183 | continue
184 | line = re.sub("#.*", "", line).strip() # remove comments
185 | parts = line.split()
186 | parts = [part.strip() for part in parts]
187 | if len(parts) > 2:
188 | value = parts[1:]
189 | elif len(parts) == 2:
190 | value = parts[1]
191 | else:
192 | logging.warn(f'could not parse line {line}')
193 | continue
194 | key = parts[0]
195 | if isinstance(value, str):
196 | try:
197 |                     value = value.replace("%", "").replace("nan", "None").replace("-inf", "None").replace("inf", "None")
198 | value = ast.literal_eval(value) if value != "None" else None
199 | except:
200 | logging.warn(f"could not parse value {value} for key {key}")
201 | elif isinstance(value, list):
202 | try:
203 |                     value = [v.replace("%", "").replace("nan", "None").replace("-inf", "None").replace("inf", "None") for v in value]
204 | value = [ast.literal_eval(v) if v != "None" else None for v in value]
205 | except:
206 | logging.warn(f"could not parse value {value} for key {key}")
207 | stats[key] = value
208 | stats["sim_seconds_precise"] = calc_sim_seconds(stats)
209 | return stats
210 |
211 |
212 | def run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, testcases: List[int] = None, cpu_number=None, exit_early_on_fail=True):
213 | input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
214 | tc_2_in_path = {}
215 | logging.info(f"Found {len(input_paths)} total testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
216 | for in_path in input_paths:
217 | tc_no = int(re.search(r"input\.(\d+)\.txt", in_path).group(1))
218 | if testcases is not None and str(tc_no) not in testcases and tc_no not in testcases:
219 | continue
220 | tc_2_in_path[tc_no] = in_path
221 | logging.info(f"Found {len(tc_2_in_path)} testcases to actually run for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
222 | tc_2_results = {}
223 | any_incorrect_or_timeout = False
224 | logging.critical(f"Running {bin_path} on testcases: {tc_2_in_path.keys()}")
225 | for tc_no, in_path in tc_2_in_path.items():
226 | # logging.critical(f"Running {bin_path} on testcase {tc_no} with input {in_path}")
227 |         #### TODO: MAKE SURE ALL CODE/BINARIES ARE IN UNIQUE DIRECTORIES
228 | stats_out_path = os.path.splitext(bin_path)[0] + f".{tc_no}.txt"
229 | if exit_early_on_fail and any_incorrect_or_timeout:
230 | tc_2_results[tc_no] = {"success": False, "error": "Previous testcase was incorrect or timed out, so skipping this testcase",
231 | "stats": None, "stdout": None, "stderr": None, "time": None}
232 | else:
233 | try:
234 | returncode, stdout, stderr = exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout, cpu_number=cpu_number)
235 | if returncode != 0:
236 |                     tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, return code: {returncode}, stderr: {stderr}",
237 | "stats": None, "stdout": stdout, "stderr": stderr, "time": None}
238 | any_incorrect_or_timeout = True
239 | else:
240 | tc_2_results[tc_no] = {"success": True, "error": None, "stats": parse_stats_txt(stats_out_path), "stdout": stdout, "stderr": stderr, "time": parse_stats_txt(stats_out_path)["sim_seconds_precise"]}
241 | except Exception as e:
242 | traceback_err = traceback.format_exc()
243 | tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, error: {e}, traceback: {traceback_err}",
244 | "stats": None, "stdout": None, "stderr": None, "time": None}
245 | any_incorrect_or_timeout = True
246 | return tc_2_results
247 |
248 |
249 | def run_gem5_multi(gem5_dir, gem5_script_path, cpu_type, bin_paths, problem_ids, testcases_dir, timeout, test_cases_list: List[int] = None, cpu_number=None, exit_early_on_fail=True):
250 | if test_cases_list is None:
251 | test_cases_list = [None for _ in range(len(bin_paths))]
252 | bin2results = defaultdict(dict)
253 | for bin_path, problem_id, test_cases in zip(bin_paths, problem_ids, test_cases_list):
254 | bin2results[bin_path] = run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, test_cases, cpu_number, exit_early_on_fail)
255 | return bin2results
256 |
257 | #### hyperfine
258 |
259 | FSTREAM_HEADER = "#include <fstream>" # for redirecting io
260 |
261 | CPP_HEADERS=[FSTREAM_HEADER]
262 |
263 | def make_redirect_io_cpp(testcase_path, output_path=None):
264 | lines = f"\nstd::ifstream cin(\"{testcase_path}\");\n"
265 | if output_path:
266 | lines = lines + f"std::ofstream cout(\"{output_path}\");\n\n"
267 | return lines
268 |
269 | def add_headers_cpp(code_str):
270 | for header in CPP_HEADERS:
271 | if header not in code_str:
272 | code_str = header + "\n" + code_str
273 | return code_str
274 |
275 |
276 | def insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs=None):
277 | import re
278 | ## match "main", any whitespace/arguments that follow, and the opening brace of the function body
279 | m = re.search(r"main(\s*)[^\{}]*{", code_str)
280 | if m is None:
281 | raise ValueError("No main function found")
282 | insert_idx = m.end()
283 | io_redirects = make_redirect_io_cpp(path_to_testcases, path_to_outputs)
284 | return code_str[:insert_idx] + io_redirects + code_str[insert_idx:]
285 |
286 |
287 | def redirect_cpp_io(code_str, path_to_testcases, path_to_outputs=None):
288 | code_str = add_headers_cpp(code_str)
289 | code_str = insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs)
290 | return code_str
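# Illustrative before/after for the redirection above (the input program is a made-up example, the
# paths are hypothetical):
#   before:  int main() { int x; cin >> x; cout << x; }
#   after:   #include <fstream>
#            int main() {
#            std::ifstream cin("/path/to/input.1.txt");
#            std::ofstream cout("/path/to/output.txt");
#             int x; cin >> x; cout << x; }
# The locally declared file streams shadow std::cin/std::cout for code that refers to them
# unqualified (e.g. via `using namespace std;`); programs that spell out std::cin / std::cout
# explicitly would not be redirected by this trick.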
291 |
292 |
293 | def redirect_cpp_io_file(code_path, stdin_path, stdout_path=None, new_code_dir=None):
294 | input_basename = os.path.splitext(os.path.basename(stdin_path))[0].replace(".", "_")
295 | if new_code_dir is None:
296 | new_code_dir = os.path.dirname(code_path)
297 | if stdout_path is None:
298 | basename = os.path.splitext(os.path.basename(code_path))[0]
299 | stdout_path = os.path.join(new_code_dir, f"{basename}_{input_basename}.stdout")
300 | with open(code_path, "r") as f:
301 | code_str = f.read()
302 | code_str = redirect_cpp_io(code_str, stdin_path, stdout_path)
303 | new_code_path = os.path.join(new_code_dir, f"redirected_{input_basename}_{os.path.basename(code_path)}")
304 | with open(new_code_path, "w") as f:
305 | f.write(code_str)
306 | return new_code_path, stdout_path
307 |
308 |
309 | def redirect_cpp_io_and_compile(code_path, stdin_path, cpu_number=None, new_code_dir=None, stdout_path=None, cflags="--std=c++17 -O3"):
310 | new_code_path, stdout_path = redirect_cpp_io_file(code_path, stdin_path, stdout_path=stdout_path, new_code_dir=new_code_dir)
311 | new_binary_path = compile_cpp_code(new_code_path, cpu_number=cpu_number, cflags=cflags)
312 | return new_binary_path, new_code_path, stdout_path
313 |
314 |
315 | ## physical / logical cpu management
316 |
317 | def get_physical_cpu_list():
318 | cmd = " grep -E '^processor|^physical id|^core id' /proc/cpuinfo "
319 | output = os.popen(cmd).read()
320 | output = output.split("processor")
321 | output = [x for x in output if x]
322 | physical2logical = defaultdict(list)
323 | n_logical = 0
324 | for cpu_info in output:
325 | logical_id = re.search(r"(?<=\t: )\d+", cpu_info).group(0)
326 | physical_id = re.search(r"(?<=core id\t\t: )\d+", cpu_info).group(0) # "core id" identifies the physical core on a single-socket machine
327 | physical2logical[int(physical_id)].append(int(logical_id))
328 | n_logical += 1
329 | n_physical = len(physical2logical)
330 | from pprint import pformat
331 | logging.info(f"Physical CPU (n={n_physical}) to Logical CPU (n={n_logical}) mapping:")
332 | logging.info(pformat(sorted(dict(physical2logical).items(), key=lambda x: int(x[0]))))
333 | unique_logical_ids = []
334 | for physical_id, logical_ids in physical2logical.items():
335 | unique_logical_ids.append(logical_ids[0])
336 | logging.info(f"The set of logical ids available for use (n={len(unique_logical_ids)}):")
337 | logging.info(unique_logical_ids)
338 | return unique_logical_ids
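# A sketch of the parsing above on a hypothetical 2-core / 4-thread machine. The grep output is
# split into per-processor chunks along the lines of:
#   processor : 0 / physical id : 0 / core id : 0
#   processor : 1 / physical id : 0 / core id : 1
#   processor : 2 / physical id : 0 / core id : 0
#   processor : 3 / physical id : 0 / core id : 1
# which yields physical2logical == {0: [0, 2], 1: [1, 3]} (keyed on "core id") and
# unique_logical_ids == [0, 1], i.e. one logical CPU per physical core, so hyperthread siblings are
# never used for benchmarking at the same time.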
339 |
340 | def add_logicial_cpus_to_queue(num_processes, queue):
341 | highest_num_processes = multiprocessing.cpu_count()
342 | if num_processes < 0:
343 | num_processes = highest_num_processes
344 | else:
345 | if num_processes > highest_num_processes:
346 | raise ValueError(f"num_processes {num_processes} is greater than the highest available cpu: {highest_num_processes}.")
347 | available_cpus = list(range(num_processes))
348 | if len(available_cpus) > 2:
349 | available_cpus = available_cpus[:-2]
350 | else:
351 | logging.warning("there are fewer than 3 logical CPUs, which is not recommended")
352 | for cpu_id in available_cpus:
353 | queue.put(cpu_id)
354 | logging.info(f"List of cpus to be used: {available_cpus}")
355 | return available_cpus
356 |
357 | def add_physical_cpus_to_queue(num_processes, queue):
358 | available_cpus = [i for i in get_physical_cpu_list() if i >= 0]
359 | if len(available_cpus) > 2:
360 | available_cpus = available_cpus[:-2]
361 | else:
362 | logging.warning("there are fewer than 3 physical CPUs, which is not recommended")
363 | if num_processes < 0:
364 | num_processes = len(available_cpus)
365 | elif len(available_cpus) < num_processes:
366 | raise ValueError(f"Only {len(available_cpus)} available cpus, but {num_processes} processes requested; the set of available cpus is {available_cpus}")
367 | for cpu_id in available_cpus[:num_processes]:
368 | queue.put(cpu_id)
369 | logging.info(f"List of cpus to be used: {available_cpus[:num_processes]}")
370 | return available_cpus
371 |
372 | def run_benchmark(args, json_output_path, timeout_seconds: int = 60): # returns (results list or None, raw output or error message)
373 | try:
374 | logging.info(f"Running {' '.join(args)}")
375 | proc = subprocess.Popen(
376 | args,
377 | preexec_fn=limit_virtual_memory,
378 | # stderr=subprocess.DEVNULL,
379 | # stdout=subprocess.DEVNULL
380 | )
381 | output = proc.communicate(timeout=timeout_seconds)[0]
382 | if os.path.exists(json_output_path):
383 | results = json.load(open(json_output_path)).get("results", [])
384 | return results, output
385 | else:
386 | return None, output
387 | except subprocess.TimeoutExpired:
388 | logging.warning(f"Timeout for {args}")
389 | _kill(proc.pid) # type: ignore
390 | return None, f"Timeout after {timeout_seconds} seconds"
391 | except json.decoder.JSONDecodeError:
392 | logging.warning(f"JSONDecodeError for {args}")
393 | return None, f"JSONDecodeError"
394 | except KeyboardInterrupt as e:
395 | _kill(proc.pid) # type: ignore
396 | raise e
397 |
398 |
399 | def run_hyperfine(code_paths: List[str],
400 | problem_ids: List[str],
401 | path_to_testcases: str,
402 | json_out_path: str, # TODO REMOVE json_out_path
403 | test_cases_list: List[int] = None,
404 | min_runs_per_test_case: int = None,
405 | max_runs_per_test_case: int = None,
406 | strict_runs_per_test_case: bool = False,
407 | warmup_runs_per_test_case: int = 5,
408 | cpu_number: int = None,
409 | do_sanity_check: bool = False,
410 | cflags: str = "--std=c++17 -O3"):
411 | """
412 | will benchmark all in 1 json / 1 run of hyperfine, all on the same cpu
413 | """
414 |
415 | ### TODO: need to change to handle compilation errors and timeouts
416 |
417 | code2benchmarks = defaultdict(list)
418 | benchmark2code = {}
419 | code2results = defaultdict(dict)
420 | code2testcases = defaultdict(list)
421 | if test_cases_list is None:
422 | test_cases_list = [None] * len(code_paths)
423 | for code_path, problem_id, test_case_list in zip(code_paths, problem_ids, test_cases_list):
424 | problem_dir = os.path.join(path_to_testcases, problem_id)
425 | testcases_paths = glob.glob(os.path.join(problem_dir, "input.*.txt"))
426 | if test_case_list is not None:
427 | testcases_paths = [t for t in testcases_paths if int(re.search(r"(?<=input\.)\d+", t).group(0)) in test_case_list]
428 | test_case_numbers = [int(re.search(r"(?<=input\.)\d+", t).group(0)) for t in testcases_paths]
429 | code2testcases[code_path] = test_case_numbers
430 | for testcase_path in testcases_paths:
431 | bin_redirect, code_redirect, _ = redirect_cpp_io_and_compile(code_path,
432 | testcase_path,
433 | cpu_number=cpu_number,
434 | cflags=cflags)
435 | code2benchmarks[code_path].append(bin_redirect)
436 | benchmark2code[bin_redirect] = code_path
437 |
438 | cmds = " ".join([bin_redirect for bin_redirects in code2benchmarks.values() for bin_redirect in bin_redirects])
439 | n_cmds = len(cmds.split(" "))
440 | if strict_runs_per_test_case:
441 | assert min_runs_per_test_case is not None
442 | runs_str = f" --runs {min_runs_per_test_case}"
443 | else:
444 | runs_str = ""
445 | if min_runs_per_test_case is not None:
446 | runs_str += f" --min-runs {min_runs_per_test_case}"
447 | if max_runs_per_test_case is not None:
448 | runs_str += f" --max-runs {max_runs_per_test_case}"
449 | if warmup_runs_per_test_case is not None:
450 | runs_str += f" --warmup {warmup_runs_per_test_case}"
451 |
452 | cmd_benchmark = (
453 | f"hyperfine {runs_str} -N {cmds} --export-json {json_out_path} "
454 | )
455 |
456 | if cpu_number is not None:
457 | cmd_benchmark = f"taskset --cpu-list {cpu_number} {cmd_benchmark}"
458 |
459 | if do_sanity_check:
460 | SANITY_CHECK_TIMEOUT = 1.5 * n_cmds
461 | cmd_sanity_check = cmd_benchmark.replace(runs_str, f" --runs 2 --warmup 1 ")
462 | p = subprocess.run(shlex.split(cmd_sanity_check), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=SANITY_CHECK_TIMEOUT, encoding="utf-8")
463 | if p.returncode != 0:
464 | return None, f"Sanity check failed for {cmd_sanity_check}: {p.stderr}"
465 | results, output = run_benchmark(shlex.split(cmd_benchmark), json_out_path)
466 | if results is None: return None, output # benchmark failed (timeout / bad JSON), propagate the error output
467 | for result in results:
468 | command = result["command"]
469 | tc_no = int(re.search(r"(?<=input_)\d+", command).group(0))
470 | code2results[benchmark2code[command]][tc_no] = result
471 | for bin, code in benchmark2code.items():
472 | results = code2results[code]
473 | missing_tcs = set(code2testcases[code]) - set(results.keys())
474 | for tc_no in missing_tcs:
475 | results[tc_no] = None
476 | return code2results, output
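# A hedged sketch of the command this function assembles (binary and output paths are hypothetical):
#   taskset --cpu-list 3 hyperfine --min-runs 10 --max-runs 50 --warmup 5 -N \
#       ./redirected_input_1_prog.out ./redirected_input_2_prog.out --export-json results.json
# hyperfine's JSON export contains one entry per command under "results" (with fields such as
# "command", "mean" and "times"); the loop above maps each entry back to the original source file
# via benchmark2code and to a testcase number via the input_<n> fragment embedded in the binary name.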
477 |
478 |
479 |
480 |
481 |
--------------------------------------------------------------------------------
/gem5/gem5_eval.py:
--------------------------------------------------------------------------------
1 | # from src.codenet_eval.run_eval import (read_ground_truths, read_inputs_and_prepare)
2 | # from src.codenet_eval.evalconfig import EvaluationConfig
3 | import tarfile
4 | import shutil
5 | import tempfile
6 | import logging
7 | import pandas as pd
8 | import json
9 | import os
10 | import pdb
11 | import argparse
12 | from gem5.simulator import PieEnvironment
13 | from gem5 import simulator
14 | import traceback
15 | import pdb
16 | import threading
17 | from tqdm import tqdm
18 | import re
19 | from typing import Optional, Any
20 | import yaml
21 | from dataclasses import dataclass, field
22 | import ast
23 |
24 | logging.basicConfig(level=logging.INFO)
25 |
26 | import signal
27 | import time
28 |
29 | KEY_COLS = ["n_tests",
30 | "problem_id",
31 | "tests"
32 | "src_id",
33 | "tgt_id",
34 | "fastest_runtime", "fastest_accuracy"]
35 |
36 |
37 | def get_key_columns(df, cfg):
38 | ## keep columns that are in KEY_COLS, or that end with
39 | ## _compilation, _accuracy, _runtime, or _tc2time
40 | key_cols = [c for c in df.columns if c in KEY_COLS or c.endswith("_compilation") or c.endswith("_accuracy") or c.endswith("_runtime") or c.endswith("_tc2time")]
41 | key_cols += [c for c in df.columns if cfg.model_generated_potentially_faster_code_col in c] + [cfg.slow_code_col, cfg.reference_code_col]
42 | key_cols = list(set(key_cols))
43 | return df[key_cols]
44 |
45 | def _fix_value(x: Any) -> Any:
46 | ## if the value is a string that starts with '[' and ends with ']', convert it to a list
47 | if isinstance(x, str) and len(x) > 1 and x[0] == '[' and x[-1] == ']':
48 | x = ast.literal_eval(x)
49 | return x
50 |
51 | def fix_df_columns(df):
52 | for col in df.columns:
53 | df[col] = df[col].apply(lambda x: _fix_value(x))
54 | return df
55 |
56 |
57 |
58 | def unmelt_results(results_df, cfg, remove_extra_cols=False):
59 | unmelted_data = []
60 | for src_id, group in results_df.groupby("src_id"):
61 | src_code_row = group[group["code_type"] == "src_code"].iloc[0]
62 | new_row = src_code_row.to_dict()
63 | for index, row in group.iterrows():
64 | new_row["src_id"] = src_id
65 | new_row[f'{row["code_type"]}_compilation'] = row["compilation"]
66 | new_row[f'{row["code_type"]}'] = row["code"]
67 | if row["code_type"].startswith(cfg.model_generated_potentially_faster_code_col) or cfg.redo_src_tgt:
68 | new_row[f'{row["code_type"]}_accuracy'] = row["accuracy"]
69 | new_row[f'{row["code_type"]}_agg_runtime'] = row["agg_runtime"]
70 | new_row[f'{row["code_type"]}_tc2time'] = row["tc2time"]
71 | unmelted_data.append(new_row)
72 | ## clean up the column names
73 | unmelted_df = pd.DataFrame(unmelted_data)
74 | if remove_extra_cols:
75 | unmelted_df = get_key_columns(unmelted_df, cfg)
76 |
77 | # unmelted_df = rename_columns(unmelted_df)
78 |
79 | return unmelted_df
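# Shape sketch: unmelt_results inverts the pd.melt performed in main(). For a hypothetical src_id
# with code_type rows "src_code", "tgt_code" and "generated_answers_0", the output is one wide row
# with columns like:
#   src_code, src_code_compilation, tgt_code, tgt_code_compilation,
#   generated_answers_0, generated_answers_0_compilation,
#   generated_answers_0_accuracy, generated_answers_0_agg_runtime, generated_answers_0_tc2time
# Accuracy/runtime columns are only (re)written for generated code, or for src/tgt as well when
# cfg.redo_src_tgt is set.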
80 |
81 | def report_results(df, cfg, orig_df):
82 | ## all generated-code columns are named cfg.model_generated_potentially_faster_code_col_*
83 | ## for these, consider only candidates that are not None and whose accuracy is at or above threshold_accuracy
84 | ## keep each candidate's runtime, but if its accuracy is below threshold_accuracy (or the candidate is missing), set the runtime to float("inf")
85 |
86 | ## rows present in the reference set but missing from the results are filled in with inf runtime and 0 accuracy
87 |
88 | ## for k = 1, 2, ..., num_generated_cols, report the best (minimum) runtime over the first k candidates
89 | ## as agg_runtime_best@k, together with accuracy_best@k,
90 | ## and speedup_best@k, where speedup = runtime_src / runtime_best@k (floored at 1.0)
91 |
92 |
93 | ## then aggregate over all rows:
94 | ### 1. for each k, report mean_accuracy_best@k and mean_speedup_best@k, where speedup is again floored at 1.0 (max(1.0, runtime_src / runtime_best@k))
95 | ### 2. for each k, report the % of programs where speedup_best@k is >= 1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0
96 |
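# Worked example with hypothetical numbers: if src_code_agg_runtime is 10.0 and the adjusted
# runtimes of four generations are [6.0, inf, 4.0, 5.0] (inf marks a missing or below-threshold
# candidate), then agg_runtime_best@1 = 6.0 (speedup 10/6, about 1.67), agg_runtime_best@2 = 6.0,
# and agg_runtime_best@4 = 4.0 (speedup 2.5). Because speedups are floored at 1.0, a row whose best
# candidate is slower than src_code (or has no correct candidate at all) contributes exactly 1.0
# to mean_speedup_best@k.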
97 | # merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
98 | import pdb
99 | # pdb.set_trace()
100 | # print("columns before report_results")
101 | # print(df.columns)
102 |
103 |
104 | # num_generated_cols = len([c for c in df.columns if re.match(f"{cfg.model_generated_potentially_faster_code_col}_[0-9]+", c) or c == cfg.model_generated_potentially_faster_code_col])
105 | num_generated_cols = cfg.num_generated_cols
106 | assert num_generated_cols is not None, f"num_generated_cols is None, it should have been set in read_inputs_and_prepare_v2"
107 |
108 | import pandas as pd
109 | import numpy as np
110 |
111 | # Assuming orig_df and df are already defined, and cfg and num_generated_cols are given
112 |
113 | # Step 1: Find rows in orig_df that are not in df
114 | # do this with src_code not src_id
115 | print(f"length of orig_df {len(orig_df)} vs length of results_df {len(df)}")
116 | orig_df["src_tgt_code"] = orig_df[cfg.slow_code_col] + orig_df[cfg.reference_code_col]
117 | df["src_tgt_code"] = df[cfg.slow_code_col] + df[cfg.reference_code_col]
118 | # drop duplicates from both
119 | df = df.drop_duplicates(subset=["src_tgt_code"])
120 | orig_df = orig_df.drop_duplicates(subset=["src_tgt_code"])
121 | unique_rows = orig_df[~orig_df['src_tgt_code'].isin(df['src_tgt_code'])]
122 | assert len(unique_rows) == (len(orig_df) - len(df)), f"len(unique_rows) {len(unique_rows)} == len(orig_df) - len(df) {len(orig_df) - len(df)}"
123 |
124 | # Step 2: Create additional columns for the unique rows and set default values
125 | for j in range(num_generated_cols + 1): # Adding 1 to include the case when j == num_generated_cols
126 | colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
127 | unique_rows[f"{colname}_agg_runtime"] = float("inf") # Setting runtime to inf
128 | unique_rows[f"{colname}_accuracy"] = 0 # Setting accuracy to 0
129 | unique_rows[f"{colname}_tc2time"] = [{} for _ in range(len(unique_rows))] # Setting tc2time to {}
130 | # drop unique rows columns that are not in df
131 | unique_rows = unique_rows[[c for c in unique_rows.columns if c in df.columns]]
132 |
133 | # Step 3: Append the modified unique rows to df
134 | df = pd.concat([df, unique_rows], ignore_index=True)
135 |
136 | print(f"columns after appending {df.columns}")
137 | print(f"unique rows columns {unique_rows.columns}")
138 | assert len(df) == 978, f"expected 978 rows (the full test split) but got {len(df)}" # note: hard-coded to the size of the test split
139 |
140 | new_rows = []
141 | for i, row in df.iterrows():
142 | for j in range(num_generated_cols):
143 | colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
144 | if row[colname] is None or pd.isna(row[colname]) or pd.isnull(row[colname]):
145 | row[f"{colname}_agg_runtime_adjusted"] = float("inf")
146 | if row[f"{colname}_accuracy"] < cfg.threshold_accuracy:
147 | row[f"{colname}_agg_runtime_adjusted"] = float("inf")
148 | else:
149 | row[f"{colname}_agg_runtime_adjusted"] = row[f"{colname}_agg_runtime"]
150 | row["fastest_generated_agg_runtime"] = min([row[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted"] for j in range(num_generated_cols)])
151 | new_rows.append(row)
152 |
153 | df = pd.DataFrame(new_rows)
154 |
155 | problem_id_to_fastest_agg_runtime = {}
156 | problem_id_to_fastest_correctness = {}
157 | for i, group in df.groupby("problem_id"):
158 | problem_id_to_fastest_agg_runtime[i] = group["fastest_generated_agg_runtime"].min()
159 | problem_id_to_fastest_correctness[i] = problem_id_to_fastest_agg_runtime[i] < float("inf")
160 |
161 | df["fastest_generated_runtime_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_agg_runtime[x])
162 | df["fastest_generated_speedup_over_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
163 | df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].apply(lambda x: max(1.0, x))
164 | df["fastest_generated_correctness_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_correctness[x])
165 |
166 |
167 | for i in range(1, num_generated_cols+1):
168 | if num_generated_cols == 0:
169 | df[f"agg_runtime_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_agg_runtime_adjusted"]
170 | df[f"accuracy_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_accuracy"]
171 | df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] == cfg.threshold_accuracy
172 | else:
173 | df[f"agg_runtime_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted" for j in range(i)]].min(axis=1)
174 | df[f"accuracy_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_accuracy" for j in range(i)]].max(axis=1)
175 | df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] == cfg.threshold_accuracy
176 | df[f"speedup_best@{i}"] = df[cfg.slow_code_col+"_agg_runtime"] / df[f"agg_runtime_best@{i}"]
177 | df[f"speedup_best@{i}"] = df[f"speedup_best@{i}"].apply(lambda x: max(1.0, x))
178 | df["speedup_of_fastest_generated_of_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
179 | df["speedup_of_fastest_generated_of_all_submissions"] = df["speedup_of_fastest_generated_of_all_submissions"].apply(lambda x: max(1.0, x))
180 |
181 | ## aggregate over all rows
182 | agg_df = pd.DataFrame(index=[0])
183 | # agg_df["fastest_generated_runtime_over_all_submissions"] = df["fastest_generated_runtime_over_all_submissions"].mean()
184 | agg_df["fastest_generated_correctness_over_all_submissions"] = df["fastest_generated_correctness_over_all_submissions"].mean()
185 | agg_df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].mean()
186 | # import pdb
187 | for i in range(1, num_generated_cols+1):
188 | # pdb.set_trace()
189 | agg_df[f"mean_accuracy_best@{i}"] = df[f"accuracy_best@{i}"].mean()
190 | agg_df[f"is_correct_best@{i}"] = df[f"is_correct_best@{i}"].mean()
191 | agg_df[f"mean_speedup_best@{i}"] = df[f"speedup_best@{i}"].mean()
192 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
193 | agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"] = (df[f"speedup_best@{i}"] >= speedup_threshold).mean()
194 |
195 | ## add the speedup of tgt_code over src_code and the threshold speedups of tgt_code over src_code
196 | df["speedup_tgt_over_src"] = df[cfg.slow_code_col+"_agg_runtime"] / df[cfg.reference_code_col+"_agg_runtime"]
197 | agg_df["mean_speedup_tgt_over_src"] = df["speedup_tgt_over_src"].mean()
198 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
199 | agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_tgt_over_src"] >= speedup_threshold).mean()
200 | agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_of_fastest_generated_of_all_submissions"] >= speedup_threshold).mean()
201 |
202 | ## pretty print out a report
203 |
204 | ## first print out the columns with asterisks separating fields *********
205 | print("********* Aggregated Results *********")
206 | for i in range(1, num_generated_cols+1):
207 | print(f"********* Results Best at {i} Generations *********")
208 | mean_accuracy = agg_df[f"mean_accuracy_best@{i}"][0]
209 | mean_speedup = agg_df[f"mean_speedup_best@{i}"][0]
210 |
211 | print(f"mean_accuracy_best@{i}: {mean_accuracy}")
212 | print(f"mean correctness best@{i}: {agg_df[f'is_correct_best@{i}'][0]}")
213 | print(f"mean_speedup_best@{i}: {mean_speedup} vs. mean_speedup_tgt_over_src: {agg_df['mean_speedup_tgt_over_src'][0]}")
214 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
215 | percent_programs = agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"][0]
216 | percent_programs_tgt_over_src = agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"][0]
217 | print(f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}: {percent_programs} vs. percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs_tgt_over_src}")
218 | print("*****************************************")
219 | print("********* Results Fastest Generated Over All Submissions *********")
220 | print("mean correctness fastest_generated_over_all_submissions: ", agg_df["fastest_generated_correctness_over_all_submissions"][0])
221 | print("average fastest_generated_speedup_over_all_submissions: ", agg_df["fastest_generated_speedup_over_all_submissions"][0])
222 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
223 | percent_programs = agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"][0]
224 | print(f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs}")
225 | print("********* End Aggregated Results *********")
226 |
227 | return agg_df, df
228 |
229 | # global env #: PieEnvironment
230 | global env
231 | env = None
232 |
233 | def sigint_handler(signum, frame):
234 | global env
235 | print("Ctrl-C pressed, running teardown...")
236 | if threading.current_thread().name == "MainThread" and env is not None:
237 | env.teardown()
238 | print("Teardown complete, exiting...")
239 | exit(0)
240 |
241 | # Set the signal handler for Ctrl+C (SIGINT)
242 | signal.signal(signal.SIGINT, sigint_handler)
243 |
244 |
245 |
246 | def read_inputs_and_prepare_v2(cfg) -> pd.DataFrame:
247 | """Reads the model generated output, the reference, joins them, and returns a dataframe with the merged data."""
248 | logging.info(f"Reading reference file from {cfg.reference_file_path}")
249 | logging.info(f"Reading model generated outputs from {cfg.model_generated_outputs_path}")
250 |
251 |
252 | gen_df = pd.read_json(
253 | cfg.model_generated_outputs_path, lines=True, orient="records"
254 | )
255 | gen_df = fix_df_columns(gen_df)
256 |
257 | logging.info(f"Read {len(gen_df)} rows from {cfg.model_generated_outputs_path}")
258 | if cfg.is_prompt_based: # note: get_input_from_prompt is not imported/defined in this file, so this path needs the commented-out codenet_eval imports restored
259 | gen_df["slower_program"] = gen_df.apply(
260 | lambda x: get_input_from_prompt(x), axis=1
261 | )
262 | else:
263 | gen_df["slower_program"] = gen_df[cfg.slow_code_col].apply(lambda x: x.strip())
264 |
265 |
266 | assert (
267 | cfg.reference_code_col in gen_df.columns
268 | ), f"Column {cfg.reference_code_col} not found in {cfg.model_generated_outputs_path}"
269 | merged = gen_df
270 |
271 |
272 | merged = merged[merged[cfg.slow_code_col] != merged[cfg.reference_code_col]]
273 |
274 | assert (
275 | len(merged) > 0
276 | ), f"{cfg.slow_code_col} and {cfg.reference_code_col} are the same for all programs"
277 |
278 | if cfg.num_problems_to_evaluate != -1:
279 | merged = merged[: cfg.num_problems_to_evaluate]
280 |
281 |
282 | # if the generated code is a list, then we have multiple generations per input.
283 | # we add one column per generation
284 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list) or isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series) or (merged[cfg.model_generated_potentially_faster_code_col].iloc[0][0] == '[' and merged[cfg.model_generated_potentially_faster_code_col].iloc[0][-1] == ']'):
285 |
286 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], str):
287 | import ast
288 | merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: ast.literal_eval(x))
289 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series):
290 | merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x.tolist())
291 | num_generations = max(merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: len(x)).tolist())
292 |
293 | for i in range(num_generations):
294 | merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
295 | # so merged will have the same number of columns for all rows, but some rows will have None in some columns (because they have fewer generations)
296 | else:
297 | num_generations = 1
298 |
299 | cfg.num_generated_cols = num_generations
300 |
301 | return merged
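# A hedged sketch of one row of the model-generated outputs jsonl that this function expects
# (column names follow the default config; values are hypothetical):
#   {"problem_id": "p00001", "src_code": "...", "tgt_code": "...",
#    "generated_answers": ["...candidate 0...", "...candidate 1..."],
#    "tests": [0, 1, 2], "n_tests": 3}
# When generated_answers holds a list (or a stringified list), one generated_answers_{i} column is
# added per candidate; rows with fewer candidates than the maximum are padded with None.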
302 |
303 |
304 |
305 | def main(cfg):
306 | # Step 0
307 | merged = read_inputs_and_prepare_v2(cfg)
308 | reference_df = pd.read_json(cfg.reference_file_path, lines=True, orient="records")
309 |
310 | logging.info(f"Number of programs to evaluate: {len(merged)}")
311 | logging.info(f"Input column: {cfg.slow_code_col}")
312 | logging.info(f"Reference column: {cfg.reference_code_col}")
313 | logging.info(f"Model generated column: {cfg.model_generated_potentially_faster_code_col}")
314 |
315 | # Step 1: Read the inputs
316 |
317 | # problem_id_to_ground_truths = read_ground_truths(cfg, merged)
318 |
319 | # Step 2: Write the inputs to a temporary directory
320 |
321 | tempdir = tempfile.TemporaryDirectory()
322 |
323 | ## we need to melt the dataframe from [slow, fast, generated_i] -> column of code_type and column of code
324 | generated_cols = []
325 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list):
326 | generated_cols = [colname for colname in merged.columns if colname.startswith(cfg.model_generated_potentially_faster_code_col) and colname[-1].isdigit()]
327 | else:
328 | generated_cols = [cfg.model_generated_potentially_faster_code_col]
329 |
330 | logging.info(f"Generated columns: {generated_cols}")
331 | code_cols = [cfg.slow_code_col, cfg.reference_code_col] + generated_cols
332 |
333 | ##PATCH
334 | ## rename src_agg_runtime -> src_code_agg_runtime and tgt_agg_runtime -> tgt_code_agg_runtime
335 | if "src_agg_runtime" in merged.columns and "tgt_agg_runtime" in merged.columns:
336 | merged = merged.rename(columns={"src_agg_runtime": cfg.slow_code_col+"_agg_runtime", "tgt_agg_runtime": cfg.reference_code_col+"_agg_runtime"})
337 |
338 | melted = pd.melt(merged,
339 | value_vars=code_cols,
340 | var_name="code_type",
341 | value_name="code",
342 | id_vars = [c for c in merged.columns if c not in code_cols])
343 |
344 | orig_len = len(melted)
345 | #drop code na/null
346 | melted = melted.dropna(subset=["code"])
347 |
348 | # sort by "n_tests"
349 | melted = melted.sort_values(by=["n_tests"], ascending=False)
350 |
351 | if not os.path.exists(os.path.join(cfg.output_dir, "test_results.jsonl")):
352 | # drop any rows where the code length is 0
353 | melted = melted[melted["code"].apply(lambda x: len(x) > 0)]
354 | logging.info(f"Dropped {orig_len - len(melted)} rows with NA or empty code")
355 |
356 | if not cfg.redo_src_tgt:
357 | ## remove and cache the rows where code_type == "src_code" or "tgt_code"
358 | src_tgt_rows = melted[(melted["code_type"] == f"{cfg.slow_code_col}") | (melted["code_type"] == f"{cfg.reference_code_col}")]
359 | melted = melted[(melted["code_type"] != f"{cfg.slow_code_col}") & (melted["code_type"] != f"{cfg.reference_code_col}")]
360 | # pdb.set_trace()
361 | else:
362 | ## if we're re-running the src_code and tgt_code, then cache the old agg_runtimes
363 | orig_src_colname = cfg.slow_code_col.replace("_code", "_agg_runtime")
364 | orig_tgt_colname = cfg.reference_code_col.replace("_code", "_agg_runtime")
365 | new_src_colname = cfg.slow_code_col.replace("_code", "_original_agg_runtime")
366 | new_tgt_colname = cfg.reference_code_col.replace("_code", "_original_agg_runtime")
367 | melted.rename(columns={orig_src_colname: new_src_colname, orig_tgt_colname: new_tgt_colname}, inplace=True)
368 |
369 | print(f"Number of programs to evaluate after dropping NA: {len(melted)}")
370 | try:
371 | if not os.path.exists(cfg.output_dir):
372 | os.makedirs(cfg.output_dir)
373 | global env
374 | env = simulator.make(timeout_seconds_gem5=120, verbose=True, use_logical_cpus=True, port=8888, workers=40, exit_early_on_fail=True)
375 | ## iterate in batches of cpus_available; env.submit_multiple_single_submissions() submits the whole batch at once
376 | new_rows = []
377 | pbar = tqdm(total=len(melted), desc=f"Submitting {len(melted)} programs to evaluate", smoothing=0)
378 | if cfg.cpus_available == -1:
379 | cfg.cpus_available = len(melted)
380 | # legacy - we used to submit in batches
381 | batch = melted
382 | # the test-case ids are sorted in descending order so that the (potentially) longest tests are run first;
383 | # this may give more "conservative" runtime estimates from tqdm
384 | results = env.submit_multiple_single_submissions(batch["code"].tolist(),
385 | [sorted(list(t), reverse=True) for t in batch["tests"].tolist()],
386 | batch["problem_id"].tolist(),
387 | "gem5")
388 |
389 | # zip the rows and results together
390 | for (i, row), result in zip(batch.iterrows(), results):
391 | row["compilation"] = result.compilation
392 | row["accuracy"] = result.mean_acc
393 | row["agg_runtime"] = result.agg_runtime
394 | row["tc2time"] = result.tc2time
395 | row["tc2stats"] = result.tc2stats # this is a lot of data, toggle if we need all the outputs from gem5's stats.txt
396 | new_rows.append(row)
397 | # pbar.update(len(batch))
398 | melted = pd.DataFrame(new_rows)
399 | melted.to_json(
400 | f"{cfg.output_dir}/melted_test_results.jsonl",
401 | orient="records",
402 | lines=True
403 | )
404 | env.teardown()
405 | ## if we hit an exception, we still want to tear down the environment, since otherwise a docker container is likely left running
406 | except Exception as e:
407 | print(e)
408 | traceback.print_exc()
409 | if threading.current_thread().name == "MainThread":
410 | # global env
411 | env.teardown()
412 | raise e
413 |
414 | if not cfg.redo_src_tgt:
415 | ## add back the src_code and tgt_code rows
416 | melted = pd.concat([melted, src_tgt_rows])
417 |
418 | unmelted_df = unmelt_results(melted, cfg)
419 |
420 | unmelted_df.to_json(
421 | f"{cfg.output_dir}/test_results.jsonl",
422 | orient="records",
423 | lines=True
424 | )
425 | else:
426 | unmelted_df = pd.read_json(
427 | f"{cfg.output_dir}/test_results.jsonl",
428 | orient="records",
429 | lines=True
430 | )
431 |
432 | agg_df, result_df = report_results(unmelted_df, cfg, reference_df)
433 |
434 | agg_df.to_csv(
435 | f"{cfg.output_dir}/aggregated_results.csv",
436 | index=False
437 | )
438 |
439 | result_df.to_json(
440 | f"{cfg.output_dir}/addtl_stats.jsonl",
441 | orient="records",
442 | lines=True
443 | )
444 |
445 | print(f"Results written to {cfg.output_dir}")
446 |
447 |
448 | @dataclass
449 | class EvaluationConfig:
450 | model_generated_outputs_path: str
451 | output_dir: str
452 | reference_file_path: str
453 | is_prompt_based: bool = False
454 | model_generated_potentially_faster_code_col: str = "generated_answers"
455 | slow_code_col: str = "src_code"
456 | reference_code_col: str = "tgt_code"
457 | cpuset_cpus: Optional[str] = None
458 | do_eval: bool = False
459 | cpus_available: int = 1
460 | num_problems_to_evaluate: int = -1
461 | threshold_accuracy: float = 1.0
462 | redo_src_tgt: bool = False
463 | num_generated_cols: Optional[int] = None
464 |
465 | def load_config(yaml_path: str) -> EvaluationConfig:
466 | with open(yaml_path, 'r') as f:
467 | config_dict = yaml.safe_load(f)
468 | return EvaluationConfig(**config_dict)
469 |
470 | if __name__ == "__main__":
471 | parser = argparse.ArgumentParser()
472 | parser.add_argument("--config_path", type=str, required=True)
473 | args = parser.parse_args()
474 | config = load_config(args.config_path)
475 | main(config)
--------------------------------------------------------------------------------