",
29 | "lstrip": false,
30 | "normalized": true,
31 | "rstrip": false,
32 | "single_word": false
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "additional_special_tokens": [
3 | "▁<PRE>",
4 | "▁<MID>",
5 | "▁<SUF>",
6 | "▁<EOT>"
7 | ],
8 | "bos_token": {
9 | "content": "<s>",
10 | "lstrip": false,
11 | "normalized": true,
12 | "rstrip": false,
13 | "single_word": false
14 | },
15 | "eos_token": {
16 | "content": "</s>",
17 | "lstrip": false,
18 | "normalized": true,
19 | "rstrip": false,
20 | "single_word": false
21 | },
22 | "unk_token": {
23 | "content": "<unk>",
24 | "lstrip": false,
25 | "normalized": true,
26 | "rstrip": false,
27 | "single_word": false
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/tokenizer.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/finetuning/tokenizer_files/7B/tokenizer.model
--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "add_bos_token": true,
3 | "add_eos_token": false,
4 | "additional_special_tokens": [
5 | "▁<PRE>",
6 | "▁<MID>",
7 | "▁<SUF>",
8 | "▁<EOT>"
9 | ],
10 | "bos_token": {
11 | "__type": "AddedToken",
12 | "content": "<s>",
13 | "lstrip": false,
14 | "normalized": true,
15 | "rstrip": false,
16 | "single_word": false
17 | },
18 | "clean_up_tokenization_spaces": false,
19 | "eos_token": {
20 | "__type": "AddedToken",
21 | "content": "</s>",
22 | "lstrip": false,
23 | "normalized": true,
24 | "rstrip": false,
25 | "single_word": false
26 | },
27 | "eot_token": "▁<EOT>",
28 | "fill_token": "<FILL_ME>",
29 | "legacy": null,
30 | "middle_token": "▁<MID>",
31 | "model_max_length": 1000000000000000019884624838656,
32 | "pad_token": null,
33 | "prefix_token": "▁<PRE>",
34 | "sp_model_kwargs": {},
35 | "suffix_first": false,
36 | "suffix_token": "▁<SUF>",
37 | "tokenizer_class": "CodeLlamaTokenizer",
38 | "unk_token": {
39 | "__type": "AddedToken",
40 | "content": "<unk>",
41 | "lstrip": false,
42 | "normalized": true,
43 | "rstrip": false,
44 | "single_word": false
45 | },
46 | "use_default_system_prompt": false
47 | }
48 |
--------------------------------------------------------------------------------
/finetuning/train.sh:
--------------------------------------------------------------------------------
1 | OUTPUT_DIR=${OUTPUT_DIR:-"saved_models/code_opt"}
2 | BASE_MODEL=${BASE_MODEL:-"codellama/CodeLlama-7b-hf"}
3 |
4 | torchrun --nproc_per_node=8 \
5 | --master_port=1234 finetune.py \
6 | --base_model $BASE_MODEL \
7 | --data_path ./data/ \
8 | --output_dir $OUTPUT_DIR \
9 | --batch_size 32 \
10 | --micro_batch_size 2 \
11 | --num_epochs 1 \
12 | --learning_rate 1e-5 \
13 | --cutoff_len 2000 \
14 | --train_on_inputs False \
15 | --prompt_template_name "code_opt" \
16 | --use_flash_attention True \
17 | --train_name "train.jsonl" \
18 | --val_name "val.jsonl" \
19 | --test_name "test.jsonl" \
20 | --wandb_project "code_opt"
21 |
22 | # Copy tokenizer files to the appropriate location; modify this if the model is different
23 | if [[ $BASE_MODEL == *"7b"* ]]; then
24 | cp -r ./tokenizer_files/7B/* $OUTPUT_DIR
25 | elif [[ $BASE_MODEL == *"13b"* ]]; then
26 | cp -r ./tokenizer_files/13B/* $OUTPUT_DIR
27 | else
28 | echo "Base model size not recognized. Tokenizer files not copied."
29 | fi
30 |
--------------------------------------------------------------------------------
/finetuning/utils/convert_to_safetensors.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import shutil
5 | from collections import defaultdict
6 | from inspect import signature
7 | from tempfile import TemporaryDirectory
8 | from typing import Dict, List, Optional, Set, Tuple
9 |
10 | import torch
11 |
12 | from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, hf_hub_download
13 | from huggingface_hub.file_download import repo_folder_name
14 | from safetensors.torch import load_file, save_file
15 | from transformers import AutoConfig
16 |
17 |
18 | COMMIT_DESCRIPTION = """
19 | This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
20 |
21 | This new file is equivalent to `pytorch_model.bin` but safe in the sense that
22 | no arbitrary code can be put into it.
23 |
24 | These files also happen to load much faster than their pytorch counterpart:
25 | https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
26 |
27 | The widgets on your model page will run using this model even if this is not merged
28 | making sure the file actually works.
29 |
30 | If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
31 |
32 | Feel free to ignore this PR.
33 | """
34 |
35 | ConversionResult = Tuple[List["CommitOperationAdd"], List[Tuple[str, "Exception"]]]
36 |
37 |
38 | class AlreadyExists(Exception):
39 | pass
40 |
41 |
42 | def shared_pointers(tensors):
43 | ptrs = defaultdict(list)
44 | for k, v in tensors.items():
45 | ptrs[v.data_ptr()].append(k)
46 | failing = []
47 | for ptr, names in ptrs.items():
48 | if len(names) > 1:
49 | failing.append(names)
50 | return failing
51 |
52 |
53 | def check_file_size(sf_filename: str, pt_filename: str):
54 | sf_size = os.stat(sf_filename).st_size
55 | pt_size = os.stat(pt_filename).st_size
56 |
57 | if (sf_size - pt_size) / pt_size > 0.01:
58 | raise RuntimeError(
59 |             f"""The file size difference is more than 1%:
60 | - {sf_filename}: {sf_size}
61 | - {pt_filename}: {pt_size}
62 | """
63 | )
64 |
65 |
66 | def rename(pt_filename: str) -> str:
67 | filename, ext = os.path.splitext(pt_filename)
68 | local = f"{filename}.safetensors"
69 | local = local.replace("pytorch_model", "model")
70 | return local
71 |
72 |
73 | def convert_multi(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
74 | filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json", token=token, cache_dir=folder)
75 | with open(filename, "r") as f:
76 | data = json.load(f)
77 |
78 | filenames = set(data["weight_map"].values())
79 | local_filenames = []
80 | for filename in filenames:
81 | pt_filename = hf_hub_download(repo_id=model_id, filename=filename, token=token, cache_dir=folder)
82 |
83 | sf_filename = rename(pt_filename)
84 | sf_filename = os.path.join(folder, sf_filename)
85 | convert_file(pt_filename, sf_filename)
86 | local_filenames.append(sf_filename)
87 |
88 | index = os.path.join(folder, "model.safetensors.index.json")
89 | with open(index, "w") as f:
90 | newdata = {k: v for k, v in data.items()}
91 | newmap = {k: rename(v) for k, v in data["weight_map"].items()}
92 | newdata["weight_map"] = newmap
93 | json.dump(newdata, f, indent=4)
94 | local_filenames.append(index)
95 |
96 | operations = [
97 | CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
98 | ]
99 | errors: List[Tuple[str, "Exception"]] = []
100 |
101 | return operations, errors
102 |
103 |
104 | def convert_single(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
105 | pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin", token=token, cache_dir=folder)
106 |
107 | sf_name = "model.safetensors"
108 | sf_filename = os.path.join(folder, sf_name)
109 | convert_file(pt_filename, sf_filename)
110 | operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
111 | errors: List[Tuple[str, "Exception"]] = []
112 | return operations, errors
113 |
114 |
115 | def convert_file(
116 | pt_filename: str,
117 | sf_filename: str,
118 | ):
119 | loaded = torch.load(pt_filename, map_location="cpu")
120 | if "state_dict" in loaded:
121 | loaded = loaded["state_dict"]
122 | shared = shared_pointers(loaded)
123 | for shared_weights in shared:
124 | for name in shared_weights[1:]:
125 | loaded.pop(name)
126 |
127 | # For tensors to be contiguous
128 | loaded = {k: v.contiguous() for k, v in loaded.items()}
129 |
130 | dirname = os.path.dirname(sf_filename)
131 | os.makedirs(dirname, exist_ok=True)
132 | save_file(loaded, sf_filename, metadata={"format": "pt"})
133 | check_file_size(sf_filename, pt_filename)
134 | reloaded = load_file(sf_filename)
135 | for k in loaded:
136 | pt_tensor = loaded[k]
137 | sf_tensor = reloaded[k]
138 | if not torch.equal(pt_tensor, sf_tensor):
139 | raise RuntimeError(f"The output tensors do not match for key {k}")
140 |
141 |
142 | def create_diff(pt_infos: Dict[str, List[str]], sf_infos: Dict[str, List[str]]) -> str:
143 | errors = []
144 | for key in ["missing_keys", "mismatched_keys", "unexpected_keys"]:
145 | pt_set = set(pt_infos[key])
146 | sf_set = set(sf_infos[key])
147 |
148 | pt_only = pt_set - sf_set
149 | sf_only = sf_set - pt_set
150 |
151 | if pt_only:
152 | errors.append(f"{key} : PT warnings contain {pt_only} which are not present in SF warnings")
153 | if sf_only:
154 | errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
155 | return "\n".join(errors)
156 |
157 |
158 | def check_final_model(model_id: str, folder: str, token: Optional[str]):
159 | config = hf_hub_download(repo_id=model_id, filename="config.json", token=token, cache_dir=folder)
160 | shutil.copy(config, os.path.join(folder, "config.json"))
161 | config = AutoConfig.from_pretrained(folder)
162 |
163 | import transformers
164 |
165 | class_ = getattr(transformers, config.architectures[0])
166 | with torch.device("meta"):
167 | (pt_model, pt_infos) = class_.from_pretrained(folder, output_loading_info=True)
168 | (sf_model, sf_infos) = class_.from_pretrained(folder, output_loading_info=True)
169 |
170 | if pt_infos != sf_infos:
171 | error_string = create_diff(pt_infos, sf_infos)
172 | raise ValueError(f"Different infos when reloading the model: {error_string}")
173 |
174 | #### XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
175 | #### SKIPPING THE REST OF THE test to save RAM
176 | return
177 | pt_params = pt_model.state_dict()
178 | sf_params = sf_model.state_dict()
179 |
180 | pt_shared = shared_pointers(pt_params)
181 | sf_shared = shared_pointers(sf_params)
182 | if pt_shared != sf_shared:
183 |         raise RuntimeError(f"The reconstructed model is wrong, shared tensors are different {pt_shared} != {sf_shared}")
184 |
185 | sig = signature(pt_model.forward)
186 | input_ids = torch.arange(10).unsqueeze(0)
187 | pixel_values = torch.randn(1, 3, 224, 224)
188 | input_values = torch.arange(1000).float().unsqueeze(0)
189 | # Hardcoded for whisper basically
190 | input_features = torch.zeros((1, 80, 3000))
191 | kwargs = {}
192 | if "input_ids" in sig.parameters:
193 | kwargs["input_ids"] = input_ids
194 | if "input_features" in sig.parameters:
195 | kwargs["input_features"] = input_features
196 | if "decoder_input_ids" in sig.parameters:
197 | kwargs["decoder_input_ids"] = input_ids
198 | if "pixel_values" in sig.parameters:
199 | kwargs["pixel_values"] = pixel_values
200 | if "input_values" in sig.parameters:
201 | kwargs["input_values"] = input_values
202 | if "bbox" in sig.parameters:
203 | kwargs["bbox"] = torch.zeros((1, 10, 4)).long()
204 | if "image" in sig.parameters:
205 | kwargs["image"] = pixel_values
206 |
207 | if torch.cuda.is_available():
208 | pt_model = pt_model.cuda()
209 | sf_model = sf_model.cuda()
210 | kwargs = {k: v.cuda() for k, v in kwargs.items()}
211 |
212 | try:
213 | pt_logits = pt_model(**kwargs)[0]
214 | except Exception as e:
215 | try:
216 | # Musicgen special exception.
217 | decoder_input_ids = torch.ones((input_ids.shape[0] * pt_model.decoder.num_codebooks, 1), dtype=torch.long)
218 | if torch.cuda.is_available():
219 | decoder_input_ids = decoder_input_ids.cuda()
220 |
221 | kwargs["decoder_input_ids"] = decoder_input_ids
222 | pt_logits = pt_model(**kwargs)[0]
223 | except Exception:
224 | raise e
225 | sf_logits = sf_model(**kwargs)[0]
226 |
227 | torch.testing.assert_close(sf_logits, pt_logits)
228 | print(f"Model {model_id} is ok !")
229 |
230 |
231 | def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
232 | try:
233 | main_commit = api.list_repo_commits(model_id)[0].commit_id
234 | discussions = api.get_repo_discussions(repo_id=model_id)
235 | except Exception:
236 | return None
237 | for discussion in discussions:
238 | if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
239 | commits = api.list_repo_commits(model_id, revision=discussion.git_reference)
240 |
241 | if main_commit == commits[1].commit_id:
242 | return discussion
243 | return None
244 |
245 |
246 | def convert_generic(model_id: str, folder: str, filenames: Set[str], token: Optional[str]) -> ConversionResult:
247 | operations = []
248 | errors = []
249 |
250 | extensions = set([".bin", ".ckpt"])
251 | for filename in filenames:
252 | prefix, ext = os.path.splitext(filename)
253 | if ext in extensions:
254 | pt_filename = hf_hub_download(model_id, filename=filename, token=token, cache_dir=folder)
255 | dirname, raw_filename = os.path.split(filename)
256 | if raw_filename == "pytorch_model.bin":
257 | # XXX: This is a special case to handle `transformers` and the
258 | # `transformers` part of the model which is actually loaded by `transformers`.
259 | sf_in_repo = os.path.join(dirname, "model.safetensors")
260 | else:
261 | sf_in_repo = f"{prefix}.safetensors"
262 | sf_filename = os.path.join(folder, sf_in_repo)
263 | try:
264 | convert_file(pt_filename, sf_filename)
265 | operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
266 | except Exception as e:
267 | errors.append((pt_filename, e))
268 | return operations, errors
269 |
270 |
271 | def convert(api: "HfApi", model_id: str, force: bool = False) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
272 | pr_title = "Adding `safetensors` variant of this model"
273 | info = api.model_info(model_id)
274 | filenames = set(s.rfilename for s in info.siblings)
275 |
276 | with TemporaryDirectory() as d:
277 | folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
278 | os.makedirs(folder)
279 | new_pr = None
280 | try:
281 | operations = None
282 | pr = previous_pr(api, model_id, pr_title)
283 |
284 | library_name = getattr(info, "library_name", None)
285 | if any(filename.endswith(".safetensors") for filename in filenames) and not force:
286 | raise AlreadyExists(f"Model {model_id} is already converted, skipping..")
287 | elif pr is not None and not force:
288 | url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
289 | new_pr = pr
290 | raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
291 | elif library_name == "transformers":
292 | if "pytorch_model.bin" in filenames:
293 | operations, errors = convert_single(model_id, folder, token=api.token)
294 | elif "pytorch_model.bin.index.json" in filenames:
295 | operations, errors = convert_multi(model_id, folder, token=api.token)
296 | else:
297 | raise RuntimeError(f"Model {model_id} doesn't seem to be a valid pytorch model. Cannot convert")
298 | check_final_model(model_id, folder, token=api.token)
299 | else:
300 | operations, errors = convert_generic(model_id, folder, filenames, token=api.token)
301 |
302 | if operations:
303 | new_pr = api.create_commit(
304 | repo_id=model_id,
305 | operations=operations,
306 | commit_message=pr_title,
307 | commit_description=COMMIT_DESCRIPTION,
308 | create_pr=True,
309 | )
310 | print(f"Pr created at {new_pr.pr_url}")
311 | else:
312 | print("No files to convert")
313 | finally:
314 | shutil.rmtree(folder)
315 | return new_pr, errors
316 |
317 |
318 | if __name__ == "__main__":
319 | DESCRIPTION = """
320 | Simple utility tool to convert automatically some weights on the hub to `safetensors` format.
321 | It is PyTorch exclusive for now.
322 | It works by downloading the weights (PT), converting them locally, and uploading them back
323 | as a PR on the hub.
324 | """
325 | parser = argparse.ArgumentParser(description=DESCRIPTION)
326 | parser.add_argument(
327 | "model_id",
328 | type=str,
329 | help="The name of the model on the hub to convert. E.g. `gpt2` or `facebook/wav2vec2-base-960h`",
330 | )
331 | parser.add_argument(
332 | "--force",
333 | action="store_true",
334 |         help="Create the PR even if it already exists or if the model was already converted.",
335 | )
336 | parser.add_argument(
337 | "-y",
338 | action="store_true",
339 | help="Ignore safety prompt",
340 | )
341 | args = parser.parse_args()
342 | model_id = args.model_id
343 | api = HfApi()
344 | if args.y:
345 | txt = "y"
346 | else:
347 | txt = input(
348 | "This conversion script will unpickle a pickled file, which is inherently unsafe. If you do not trust this file, we invite you to use"
349 | " https://huggingface.co/spaces/safetensors/convert or google colab or other hosted solution to avoid potential issues with this file."
350 | " Continue [Y/n] ?"
351 | )
352 | if txt.lower() in {"", "y"}:
353 | try:
354 | commit_info, errors = convert(api, model_id, force=args.force)
355 | string = f"""
356 | ### Success 🔥
357 | Yay! This model was successfully converted and a PR was opened using your token, here:
358 | [{commit_info.pr_url}]({commit_info.pr_url})
359 | """
360 | if errors:
361 | string += "\nErrors during conversion:\n"
362 | string += "\n".join(
363 | f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors
364 | )
365 | print(string)
366 | except Exception as e:
367 | print(
368 | f"""
369 | ### Error 😢😢😢
370 |
371 | {e}
372 | """
373 | )
374 | else:
375 | print(f"Answer was `{txt}` aborting.")
--------------------------------------------------------------------------------
/finetuning/utils/prompter.py:
--------------------------------------------------------------------------------
1 | """
2 | A dedicated helper to manage templates and prompt building.
3 |
4 | Code adapted from the alpaca-lora repository at https://github.com/tloen/alpaca-lora/blob/main/utils/prompter.py
5 | """
6 |
7 | import json
8 | import os.path as osp
9 | from typing import Union
10 |
11 |
12 | class Prompter(object):
13 |
14 | __slots__ = ("template", "_verbose", "pctile_test")
15 |
16 | def __init__(self, template_name: str = "", verbose: bool = False):
17 | self._verbose = verbose
18 | self.pctile_test = False
19 | if template_name == "code_opt_w_speedup_pctile_test":
20 | self.pctile_test = True
21 | template_name = "code_opt_w_speedup_pctile"
22 | if not template_name:
23 | # Enforce the default here, so the constructor can be called with '' and will not break.
24 | template_name = "code_opt"
25 | file_name = osp.join("templates", f"{template_name}.json")
26 | if not osp.exists(file_name):
27 | raise ValueError(f"Can't read {file_name}")
28 | with open(file_name) as fp:
29 | self.template = json.load(fp)
30 | if self._verbose:
31 | print(
32 | f"Using prompt template {template_name}: {self.template['description']}"
33 | )
34 |
35 | print(f"template_name: {template_name}")
36 |         print(f"pctile_test: {self.pctile_test}")
37 |
38 | def generate_prompt(
39 | self,
40 | src_code: str,
41 | tgt_code: Union[None, str] = None,
42 | speedup_desc: Union[None, str] = None,
43 | speedup_bin: Union[None, str] = None,
44 | pctile: Union[None, str] = None,
45 | code_cutoff: int = 1500,
46 | ) -> str:
47 | # returns the full prompt from src_code and optional input
48 | # if a tgt_code (=response, =output) is provided, it's also appended.
49 |
50 |         # truncate src_code (and tgt_code below) to code_cutoff characters so the prompt does not get too long
51 | src_code = src_code[:code_cutoff]
52 |
53 | if speedup_desc and speedup_bin:
54 |             raise ValueError("speedup_desc and speedup_bin cannot both be set.")
55 |
56 | if tgt_code:
57 | tgt_code = tgt_code[:code_cutoff]
58 |
59 | if speedup_desc:
60 | try:
61 | res = self.template["prompt_no_input"].format(
62 | src_code=src_code,
63 | speedup_desc=speedup_desc
64 | )
65 | except Exception as e:
66 | print("Oops! There is no speedup_desc in the template prompt!")
67 | elif speedup_bin:
68 | try:
69 | res = self.template["prompt_no_input"].format(
70 | src_code=src_code,
71 | speedup_bin=speedup_bin
72 | )
73 | except Exception as e:
74 | print("Oops! There is no speedup_bin in the template prompt!")
75 | elif pctile:
76 | try:
77 | res = self.template["prompt_no_input"].format(
78 | src_code=src_code,
79 | pctile=pctile
80 | )
81 | except Exception as e:
82 | print("Oops! There is no pctile in the template prompt!")
83 | elif self.pctile_test: # test time
84 | try:
85 | res = self.template["prompt_no_input"].format(
86 | src_code=src_code,
87 | pctile="10"
88 | )
89 | except Exception as e:
90 | print("Oops! There is no pctile in the template prompt!")
91 | else: # only src_code
92 | try:
93 | res = self.template["prompt_no_input"].format(
94 | src_code=src_code
95 | )
96 | except Exception as e:
97 | print("Oops! There is no src_code in the template prompt!")
98 |
99 | if tgt_code:
100 | res = f"{res}{tgt_code}"
101 |
102 | if self._verbose:
103 | print(res)
104 | return res
105 |
106 | def get_response(self, output: str) -> str:
107 | return output.split(self.template["response_split"])[1].strip()
108 |
--------------------------------------------------------------------------------
/gem5/README.md:
--------------------------------------------------------------------------------
1 | # Gem5 Simulator for PIE
2 |
3 | ## Overview
4 |
5 | This subdirectory contains the `gem5` module, which we use to interface with the `gem5` simulator. The `gem5` simulator is a full-system and CPU simulator that can be used to simulate the execution of a program on a computer system. We use `gem5` to simulate the execution of programs in a deterministic and reproducible manner.
6 |
7 | For our experiments, we use a simulated model of the Intel Skylake CPU.
8 | We provide an easy-to-use docker image and API that can be used to reproduce our results and for other researchers to continue to use for program optimization research.
9 |
10 | Building the environment is similar to the [gym](https://github.com/Farama-Foundation/Gymnasium) API for reinforcement learning. After importing the module and calling `simulator.make()`, the docker image is automatically pulled on first use and a container is created. The resulting environment object then provides a convenient abstraction for interacting with the simulator.
11 |
12 | Results from our experiments can be located in [this google drive folder](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing).
13 |
14 |
15 |
16 | ## Usage
17 | \***********************************************************************************************************************************
18 |
19 | **Note that in order to use the module and its container for simulation, your architecture will need to be x86-64 (AMD64)**
20 |
21 | \***********************************************************************************************************************************
22 |
23 | First, you need to add the pie project to your Python path. You can do this by running the following command from the root of the pie project:
24 |
25 | ```bash
26 | export PYTHONPATH=$PYTHONPATH:$(pwd)
27 | ```
28 |
29 | You will also need Docker installed on your system. The module uses the Docker Python SDK and is designed to abstract away the hassle of pulling the container and configuring the gem5 simulator. We have designed it to reflect the OpenAI Gym API, so it should be easy to use for anyone familiar with that interface.
30 |
31 | ```python
32 |
33 | from gem5 import simulator
34 | env = simulator.make(...)
35 | results = env.submit_multiple_single_submissions(...)
36 |
37 | ```
38 |
39 | To get started, call `simulator.make()` to create an environment object, which you can then use to submit programs to the simulator backend; a sketch of a typical call is shown after the argument list below.
40 |
41 | #### Key Arguments for simulator.make()
42 |
43 | - `arch`: The architecture to use. Currently only 'X86-skylake' is supported.
44 | - `cpuset_cpus`: The cpus to use. If not specified, all cpus are used.
45 | - `workers`: The number of workers to use. If not specified, all cpus are used.
46 | - `gem5_acc_threshold`: If the functional accuracy is below this threshold, we skip any benchmarking and return the result early.
47 | - `port`: The port to use for communication.
48 | - `optimization_flag`: The GCC optimization flag to use for compilation, for our work we used '-O3'.
49 | - `cpu_type`: The type of CPU configuration to use. For our work we used 'Verbatim' from the skylake configuration used.
50 | - `timeout_seconds_gem5`: The timeout in seconds for the gem5 simulator, for our work we used 120 seconds for evaluation.
51 | - `verbose`: We highly recommend setting this to True to monitor the progress of the gem5 simulator.
52 | - `exit_early_on_fail`: If True, we exit early when any individual test case times out or encounters a runtime error. We highly recommend setting this to True to speed things up if you are only evaluating, since a failing program cannot contribute to any speedups.
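For illustration, a call using the arguments above might look like the following sketch. The values shown (worker count, port, accuracy threshold) are placeholders rather than recommendations; only the argument names documented above are assumed.

```python
from gem5 import simulator

# Illustrative values only: adjust the worker count, port, and threshold for your setup.
env = simulator.make(
    arch="X86-skylake",        # currently the only supported architecture
    workers=8,                 # placeholder worker count
    gem5_acc_threshold=1.0,    # skip benchmarking when functional accuracy falls below this
    port=5000,                 # placeholder port for communication
    optimization_flag="-O3",   # GCC flag used in our experiments
    cpu_type="Verbatim",       # Skylake configuration used in our experiments
    timeout_seconds_gem5=120,  # per-run gem5 timeout used for evaluation
    verbose=True,              # recommended for monitoring progress
    exit_early_on_fail=True,   # recommended when only evaluating
)
```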
53 |
54 | #### Key Arguments for env.submit_multiple_single_submissions()
55 |
56 | - `code_list`: A list of strings, each string is the code of a single submission.
57 | - `testcases_list`: A list of lists; each sublist contains the integer indices (into the test case pool) of the test cases used for benchmarking the corresponding code.
58 | - `problem_id_list`: A list of strings, each string is the problem id for the corresponding code.
59 | - `timing_env`: The timing environment to use. Currently only 'gem5' is fully supported; there is prototype support for hardware-based benchmarking on your machine via 'hyperfine' (or 'both'), but the 'hyperfine' support is not fully implemented yet. A sketch of a typical submission is shown below.
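For example, a submission using the environment created above could look like the sketch below. The file names and test case indices are placeholders and must exist in your test case pool; the problem id is taken from the repository's test examples.

```python
# Placeholder inputs: two candidate programs for the same problem.
with open("candidate_a.cpp") as f:
    code_a = f.read()
with open("candidate_b.cpp") as f:
    code_b = f.read()

results = env.submit_multiple_single_submissions(
    code_list=[code_a, code_b],
    testcases_list=[[0, 1, 2], [0, 1, 2]],  # integer indices into the test case pool
    problem_id_list=["p02549", "p02549"],   # problem id from the repository's test examples
    timing_env="gem5",                      # only 'gem5' is fully supported
)
```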
60 |
61 | ## Evaluation Script
62 |
63 | The evaluation driver is located in `gem5/gem5_eval.py`. This script requires a yaml configuration file to be passed in as an argument to `--config_path`. Example usage from the project directory would be:
64 |
65 | ```bash
66 | export PYTHONPATH=$PYTHONPATH:$(pwd)
67 | python gem5/gem5_eval.py --config_path PATH_TO_EXPERIMENT_CONFIG.yaml
68 | ```
69 |
70 | The yaml configuration file should contain at least the following fields (a short sketch of generating such a config is shown after the list):
71 |
72 | - `model_generated_outputs_path`: The path to the model generated outputs. This should be a `.jsonl` file containing the model generated outputs in addition to all other metadata in the test set file.
73 | - `output_dir`: The directory to output the results to.
74 | - `reference_file_path`: The path to the reference file. This should be the reference `.jsonl` file containing the reference outputs in addition to all other metadata in the test set file.
75 | - `model_generated_potentially_faster_code_col`: The column in the model generated outputs that contains the model's generations of potentially faster code. We've used "generated_answers" as a default.
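As a rough sketch, a configuration with these fields could be written programmatically before invoking the driver. The paths below are placeholders; only the field names listed above and the "generated_answers" default are taken from this README.

```python
import subprocess
import yaml

# Placeholder paths; "generated_answers" matches the default column name mentioned above.
config = {
    "model_generated_outputs_path": "outputs/model_generations.jsonl",
    "output_dir": "outputs/gem5_results",
    "reference_file_path": "data/test.jsonl",
    "model_generated_potentially_faster_code_col": "generated_answers",
}

with open("experiment_config.yaml", "w") as f:
    yaml.safe_dump(config, f)

subprocess.run(
    ["python", "gem5/gem5_eval.py", "--config_path", "experiment_config.yaml"],
    check=True,
)
```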
76 |
77 | An example is provided in [gem5/template_config.yaml](template_config.yaml).
78 |
--------------------------------------------------------------------------------
/gem5/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/gem5/__init__.py
--------------------------------------------------------------------------------
/gem5/api_pytest.py:
--------------------------------------------------------------------------------
1 | import benchmarking
2 | import tempfile
3 | import subprocess
4 | import os
5 | import glob
6 | import numpy as np
7 | from tqdm import tqdm
8 | from collections import defaultdict
9 |
10 | count_to_10_cpp = """
11 | #include <iostream>
12 | using namespace std;
13 |
14 | int main() {
15 | for (int i = 0; i < 10; i++) {
16 | cout << i << endl;
17 | }
18 | return 0;
19 | }
20 | """
21 |
22 | mult_in_by_2_cpp = """
23 | #include <iostream>
24 | using namespace std;
25 |
26 | int main() {
27 | int x;
28 | cin >> x;
29 | cout << x * 2 << endl;
30 | return 0;
31 | }
32 | """
33 |
34 | example_1_code = """
35 | #include <bits/stdc++.h>
36 | #define REP(i, n) for (int i = 0; i < (n); i++)
37 | using namespace std;
38 | const int MOD = 998244353;
39 |
40 | int main() {
41 | cin.tie(0)->sync_with_stdio(false);
42 |
43 | int n, k; cin >> n >> k;
44 |   vector<int> l(k), r(k);
45 | REP(i, k) cin >> l[i] >> r[i];
46 | REP(i, k) r[i]++;
47 |
48 |   vector<long long> dp(n + 1, 0);
49 | dp[0] = 1;
50 | dp[1] = -1;
51 | REP(i, n) {
52 | if (i > 0)
53 | dp[i] = (dp[i] + dp[i - 1]) % MOD;
54 | REP(j, k) {
55 | if (i + l[j] < n)
56 | dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
57 | if (i + r[j] < n)
58 | dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
59 | }
60 | }
61 | cout << dp[n - 1] << endl;
62 | return 0;
63 | }
64 | """
65 | example_1_problem_id = "p02549"
66 |
67 | example_hello_world_code = """
68 | #include <iostream>
69 |
70 | int main() {
71 | std::cout << "Hello, World!" << std::endl;
72 | return 0;
73 | }
74 | """
75 |
76 | # def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout):
77 | # logging.info(f'executing {bin_path}, with input {in_path}')
78 | # with open(in_path, 'r') as fh:
79 | # p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
80 | # if p.returncode != 0:
81 | # raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr.decode('utf-8')}")
82 | # else:
83 | # return get_accuracy(p.stdout, ground_truth_output)
84 |
85 |
86 | class TestBenchmarking:
87 | def test_compile(self):
88 | with tempfile.TemporaryDirectory() as tmpdir:
89 | code_path = os.path.join(tmpdir, "basic.cpp")
90 | with open(code_path, "w") as f:
91 | f.write(count_to_10_cpp)
92 | output_path = benchmarking.compile_cpp_code(code_path)
93 | p = subprocess.run([output_path], capture_output=True, text=True)
94 | assert p.returncode == 0
95 | assert p.stdout.strip() == "\n".join([str(i) for i in range(10)])
96 | assert os.path.exists(output_path)
97 | assert os.path.getsize(output_path) > 0
98 |
99 | def test_exec_bin(self):
100 | with tempfile.TemporaryDirectory() as tmpdir:
101 | code_path = os.path.join(tmpdir, "basic.cpp")
102 | with open(code_path, "w") as f:
103 | f.write(count_to_10_cpp)
104 | output_path = benchmarking.compile_cpp_code(code_path)
105 | rc, stdout, stderr = benchmarking.exec_bin(output_path, None, None)
106 | assert rc == 0
107 | assert stdout.strip() == "\n".join([str(i) for i in range(10)])
108 | assert stderr == ""
109 |
110 | def test_exec_bin_input(self):
111 | with tempfile.TemporaryDirectory() as tmpdir:
112 | code_path = os.path.join(tmpdir, "basic.cpp")
113 | input_path = os.path.join(tmpdir, "input.txt")
114 | with open(code_path, "w") as f:
115 | f.write(mult_in_by_2_cpp)
116 | with open(input_path, "w") as f:
117 | f.write("2")
118 | output_path = benchmarking.compile_cpp_code(code_path)
119 | rc, stdout, stderr = benchmarking.exec_bin(output_path, input_path, None)
120 | assert rc == 0
121 | assert stdout.strip() == "4"
122 | assert stderr == ""
123 |
124 | def test_exec_bin_for_acc(self):
125 | with tempfile.TemporaryDirectory() as tmpdir:
126 | code_path = os.path.join(tmpdir, "basic.cpp")
127 | input_path = os.path.join(tmpdir, "input.txt")
128 | with open(code_path, "w") as f:
129 | f.write(mult_in_by_2_cpp)
130 | with open(input_path, "w") as f:
131 | f.write("2")
132 | output_path = benchmarking.compile_cpp_code(code_path)
133 | acc_correct = benchmarking.exec_bin_for_acc(output_path, input_path, "4", None)
134 | acc_incorrect = benchmarking.exec_bin_for_acc(output_path, input_path, "5", None)
135 | assert acc_correct == 1
136 | assert acc_incorrect == 0
137 |
138 | def test_compile_and_check_outputs(self):
139 | with tempfile.TemporaryDirectory() as tempdir:
140 | code_path = os.path.join(tempdir, "basic.cpp")
141 | with open(code_path, "w") as fh:
142 | fh.write(example_1_code)
143 | bin_path, accs = benchmarking.compile_and_check_outputs(
144 | code_path=code_path,
145 | problem_id=example_1_problem_id,
146 | testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/"
147 | )
148 | print(f"bin_path: {bin_path}")
149 | assert os.path.exists(bin_path)
150 | assert os.path.getsize(bin_path) > 0
151 | assert np.mean(list(accs.values())) == 1.0
152 | assert np.std(list(accs.values())) == 0.0
153 | n_testcases = len(glob.glob(os.path.join("/home/pie-perf/data/codenet/merged_test_cases/", example_1_problem_id, "input.*.txt")))
154 | assert len(accs) == n_testcases
155 |
156 | def test_exec_gem5(self):
157 | sim_seconds = []
158 | sim_seconds_precise = []
159 | for _ in tqdm(range(5)):
160 | with tempfile.TemporaryDirectory() as tmpdir:
161 | code_path = os.path.join(tmpdir, "basic.cpp")
162 | with open(code_path, "w") as f:
163 | f.write(example_hello_world_code)
164 | output_path = benchmarking.compile_cpp_code(code_path, cflags="--std=c++17 -O3")
165 | rc, stdout, stderr = benchmarking.exec_gem5(
166 | gem5_dir="/home/gem5/build/X86/",
167 | gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py",
168 | cpu_type="Verbatim",
169 | bin_path=output_path,
170 | in_path=None,
171 | stats_out_path=os.path.join(tmpdir, "stats.txt"),
172 | timeout=60,
173 | cpu_number=0)
174 |
175 | assert rc == 0
176 | stats = benchmarking.parse_stats_txt(os.path.join(tmpdir, "stats.txt"))
177 | sim_seconds.append(stats["sim_seconds"])
178 | sim_seconds_precise.append(stats["sim_seconds_precise"])
179 | print(f"sim_seconds: {sim_seconds}")
180 | print(f"sim_seconds_precise: {sim_seconds_precise}")
181 | assert np.isclose(np.mean(sim_seconds), 0.001004, atol=1e-5)
182 | assert np.isclose(np.mean(sim_seconds_precise), 0.001004, atol=1e-5)
183 | assert all(sim_seconds_precise[i] == 0.001004121118 for i in range(len(sim_seconds_precise)))
184 |
185 | def test_run_gem5(self):
186 | sim_seconds_0 = []
187 | sim_seconds_1 = []
188 | for _ in tqdm(range(2)):
189 | with tempfile.TemporaryDirectory() as tmpdir:
190 | code_path = os.path.join(tmpdir, "code.cpp")
191 | with open(code_path, "w") as f:
192 | f.write(example_1_code)
193 | bin_path = benchmarking.compile_cpp_code(code_path)
194 | tc_2_results = benchmarking.run_gem5(
195 | gem5_dir="/home/gem5/build/X86/",
196 | gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py",
197 | cpu_type="Verbatim",
198 | bin_path=bin_path,
199 | problem_id=example_1_problem_id,
200 | testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/",
201 | testcases=[0,1],
202 | timeout=30,
203 | cpu_number=0
204 | )
205 | assert tc_2_results[0]["success"] == True
206 | assert tc_2_results[1]["success"] == True
207 | assert len(tc_2_results) == 2
208 | sim_seconds_0.append(tc_2_results[0]["stats"]["sim_seconds_precise"])
209 | sim_seconds_1.append(tc_2_results[1]["stats"]["sim_seconds_precise"])
210 | print(f"sim_seconds for tc 0 {sim_seconds_0}")
211 | print(f"sim_seconds for tc 1 {sim_seconds_1}")
212 | assert sim_seconds_0[0] == sim_seconds_0[1] == 0.001035073468
213 | assert sim_seconds_1[0] == sim_seconds_1[1] == 0.001039205596
214 |
215 |
216 | def test_run_hyperfine(self):
217 | tc2times = defaultdict(list)
218 | for _ in range(2):
219 | with tempfile.TemporaryDirectory() as tmpdir:
220 | code_path = os.path.join(tmpdir, "code.cpp")
221 | with open(code_path, "w") as f:
222 | f.write(example_1_code)
223 | code2results, output = benchmarking.run_hyperfine(
224 | code_paths=[code_path],
225 | problem_ids=[example_1_problem_id],
226 | path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
227 | json_out_path=os.path.join(tmpdir, "results.json"),
228 | test_cases_list=[[i for i in range(10)]],
229 | min_runs_per_test_case=10,
230 | max_runs_per_test_case=500,
231 | strict_runs_per_test_case=False,
232 | warmup_runs_per_test_case=5,
233 | cpu_number=0,
234 | do_sanity_check=True)
235 | for tc, results in code2results[code_path].items():
236 | tc2times[tc].append(np.array(results["times"]))
237 | for tc, times in tc2times.items():
238 | mean_times = []
239 | for time_list in times:
240 | mean_times.append(np.mean(time_list))
241 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
242 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
243 | assert len(tc2times) == 10
244 |
245 | def test_run_hyperfine_strict(self):
246 | tc2times = defaultdict(list)
247 | for _ in range(2):
248 | with tempfile.TemporaryDirectory() as tmpdir:
249 | code_path = os.path.join(tmpdir, "code.cpp")
250 | with open(code_path, "w") as f:
251 | f.write(example_1_code)
252 | code2results, output = benchmarking.run_hyperfine(
253 | code_paths=[code_path],
254 | problem_ids=[example_1_problem_id],
255 | path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
256 | json_out_path=os.path.join(tmpdir, "results.json"),
257 | test_cases_list=None,
258 | min_runs_per_test_case=100,
259 | max_runs_per_test_case=None,
260 | strict_runs_per_test_case=True,
261 | warmup_runs_per_test_case=5,
262 | cpu_number=0,
263 | do_sanity_check=True)
264 | for tc, results in code2results[code_path].items():
265 | tc2times[tc].append(np.array(results["times"]))
266 | for tc, times in tc2times.items():
267 | assert len(times) == 2
268 | mean_times = []
269 | for time_list in times:
270 | assert len(time_list) == 100
271 | mean_times.append(np.mean(time_list))
272 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
273 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
274 | assert len(tc2times) == len(glob.glob(f"/home/pie-perf/data/codenet/merged_test_cases/{example_1_problem_id}/input*"))
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
--------------------------------------------------------------------------------
/gem5/benchmarking.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 | import shutil
4 | import os
5 | import warnings
6 | import traceback
7 | import logging
8 | import subprocess
9 | import glob
10 | import re
11 | import traceback
12 | import time
13 | import shlex
14 | from typing import Optional, List, Tuple, Dict, Any, Union
15 | import multiprocessing
16 | from collections import defaultdict
17 | import json
18 | import resource
19 | import re
20 | import ast
21 | from dataclasses import dataclass
22 |
23 | logging.basicConfig(level=logging.DEBUG)
24 | logging.getLogger("resource").setLevel(logging.DEBUG)
25 |
26 | MAX_VIRTUAL_MEMORY = 10 * 1024 * 1024 * 50 # 500 MB
27 |
28 | # from https://gist.github.com/s3rvac/f97d6cbdfdb15c0a32e7e941f7f4a3fa
29 | def limit_virtual_memory():
30 | resource.setrlimit(resource.RLIMIT_AS, (MAX_VIRTUAL_MEMORY, MAX_VIRTUAL_MEMORY * 10))
31 |
32 |
33 | def get_accuracy(output: str, ground_truth: str) -> float:
34 | """
35 | Compare the output of the code with the ground truth.
36 | """
37 | num_correct = 0
38 | ground_truth_lines = ground_truth.strip().splitlines()
39 | output_truth_lines = output.strip().splitlines()
40 | for gen_output, ground_truth_output in zip(output_truth_lines, ground_truth_lines):
41 | is_corr = gen_output == ground_truth_output
42 | if not is_corr:
43 | try:
44 | gen_output = float(gen_output)
45 | ground_truth_output = float(ground_truth_output)
46 | is_corr = abs(gen_output - ground_truth_output) < 1e-3
47 | except:
48 | pass
49 | num_correct += int(is_corr)
50 |
51 | return num_correct / len(ground_truth_lines)
52 |
53 | def compile_cpp_code(code_path: str, timeout: int = 30, output_path: str = None, cflags: str = "--std=c++17 -O3", cpu_number: Optional[int] = None) -> str:
54 |     """Compile a C++ source file with g++ and return the path to the compiled binary.
55 |
56 |     Args:
57 |         code_path (str): Path to the C++ source file.
58 |         output_path (str, optional): Output binary path; defaults to the source path with a ".out" extension.
59 |         cflags (str, optional): Flags passed to g++; cpu_number optionally pins compilation to a CPU via taskset.
60 |
61 |     Returns:
62 |         str: Path to the compiled binary. Raises if compilation fails or the timeout is exceeded.
63 |     """
64 | if output_path is None:
65 | output_path = os.path.join(os.path.dirname(code_path), f"{os.path.splitext(os.path.basename(code_path))[0]}.out")
66 | cpu_cmd = f"taskset --cpu-list {cpu_number}" if cpu_number is not None else ""
67 |
68 | cmd = shlex.split(cpu_cmd) + ["/usr/bin/g++", code_path, "-o", output_path] + shlex.split(cflags.replace('"', "").replace("'", ""))
69 | logging.critical(f"Running command: {' '.join(cmd)}")
70 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
71 | if p.returncode != 0:
72 | raise Exception(f"Error compiling code: {code_path} with command: {' '.join(cmd)}, return code: {p.returncode}, stderr: {p.stderr}")
73 | else:
74 | # sometimes there can be latency in the file system, so we wait a bit
75 | while(not os.path.exists(output_path)):
76 | time.sleep(0.05)
77 | return output_path
78 |
79 | def exec_bin(bin_path, in_path, timeout, cpu_number=None):
80 | logging.info(f'executing {bin_path}, with input {in_path}')
81 | if in_path is not None:
82 | fh = open(in_path, 'r')
83 | else:
84 | fh = subprocess.DEVNULL
85 | cmd = [bin_path]
86 | if cpu_number is not None:
87 | cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
88 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
89 | if in_path is not None:
90 | fh.close()
91 | return p.returncode, p.stdout, p.stderr
92 |
93 | def exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout: str = None, cpu_number=None):
94 | gem5_bin = os.path.join(gem5_dir, 'gem5.opt')
95 | cmd = shlex.split(f"{gem5_bin} --stats-file={stats_out_path} {gem5_script_path} {cpu_type} {bin_path}")
96 | if cpu_number is not None:
97 | cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
98 | if in_path is not None:
99 | logging.info(f'executing {" ".join(cmd)}, with input {in_path}')
100 | with open(in_path, 'r') as fh:
101 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
102 | else:
103 | logging.info(f'executing {" ".join(cmd)}, with no input')
104 | p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
105 | return p.returncode, p.stdout, p.stderr
106 |
107 | def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout=None):
108 | logging.info(f'executing {bin_path}, with input {in_path}')
109 | with open(in_path, 'r') as fh:
110 | p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
111 | if p.returncode != 0:
112 | raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr.decode('utf-8')}")
113 | else:
114 | return get_accuracy(p.stdout, ground_truth_output)
115 |
116 | def compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout=None, cflags: str ="--std=c++17 -O3", testcases: List[int] = None, cpu_number=None):
117 |
118 | input_output_pairs = {}
119 | input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
120 | for in_path in input_paths:
121 | tc_no = re.search(r"input\.(\d+)\.txt", in_path).group(1)
122 | if testcases is not None and int(tc_no) not in testcases and tc_no not in testcases: # allow both int and str
123 | continue
124 | out_path = os.path.join(testcases_dir, problem_id, f"output.{tc_no}.txt")
125 | input_output_pairs[tc_no] = (in_path, out_path)
126 | logging.info(f"Found {len(input_output_pairs)} testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
127 | try:
128 | bin_path = compile_cpp_code(code_path, timeout, cflags=cflags, cpu_number=cpu_number)
129 | logging.info(f"Compiled {code_path} to {bin_path}")
130 | except Exception as e:
131 | return None, {tc_no: 0 for tc_no in input_output_pairs.keys()}
132 |
133 | accs = {}
134 |
135 | for tc_no, (in_path, out_path) in input_output_pairs.items():
136 | with open(out_path, 'r') as fh:
137 | ground_truth_output = fh.read().strip()
138 | try:
139 | acc = exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout)
140 | accs[tc_no] = acc
141 | except Exception as e:
142 | logging.error(f"Error executing code: {bin_path} with input: {in_path}, error: {e}")
143 | accs[tc_no] = 0
144 |
145 | logging.info(f"bin_path: {bin_path}, accs: {accs}")
146 |
147 | return bin_path, accs
148 |
149 | def compile_and_check_outputs_multi(
150 | code_paths,
151 | problem_ids,
152 | testcases_dir,
153 | timeout=None,
154 | cflags: str ="--std=c++17 -O3",
155 | test_cases_list = None,
156 | cpu_number=None):
157 | if test_cases_list is None:
158 | test_cases_list = [None for _ in range(len(code_paths))]
159 | code2results = defaultdict(dict)
160 | for code_path, problem_id, test_cases in zip(code_paths, problem_ids, test_cases_list):
161 | bin_path, accs = compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout, cflags, test_cases, cpu_number)
162 | code2results[code_path]["compile_success"] = bin_path is not None
163 | code2results[code_path]["bin_path"] = bin_path
164 | code2results[code_path]["accs"] = accs
165 | return code2results
166 |
167 |
168 | def calc_sim_seconds(stats):
169 | return float(stats["sim_ticks"]) / float(stats["sim_freq"]) # more accurate than sim_seconds
170 |
171 |
172 | def parse_stats_txt(stats_path):
173 | with open(stats_path, 'r') as f:
174 | stats_lines = f.readlines()
175 |
176 | stats = {}
177 | for line in stats_lines:
178 | if line.strip() == '':
179 | continue
180 | if "Begin" in line:
181 | continue
182 | if "End" in line:
183 | continue
184 | line = re.sub("#.*", "", line).strip() # remove comments
185 | parts = line.split()
186 | parts = [part.strip() for part in parts]
187 | if len(parts) > 2:
188 | value = parts[1:]
189 | elif len(parts) == 2:
190 | value = parts[1]
191 | else:
192 | logging.warn(f'could not parse line {line}')
193 | continue
194 | key = parts[0]
195 | if isinstance(value, str):
196 | try:
197 | value = value.replace("%", "").replace("nan", "None").replace("inf", "None").replace("-inf", "None")
198 | value = ast.literal_eval(value) if value != "None" else None
199 | except:
200 | logging.warn(f"could not parse value {value} for key {key}")
201 | elif isinstance(value, list):
202 | try:
203 | value = [v.replace("%", "").replace("nan", "None").replace("inf", "None").replace("-inf", "None") for v in value]
204 | value = [ast.literal_eval(v) if v != "None" else None for v in value]
205 | except:
206 | logging.warn(f"could not parse value {value} for key {key}")
207 | stats[key] = value
208 | stats["sim_seconds_precise"] = calc_sim_seconds(stats)
209 | return stats
210 |
211 |
212 | def run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, testcases: List[int] = None, cpu_number=None, exit_early_on_fail=True):
213 | input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
214 | tc_2_in_path = {}
215 | logging.info(f"Found {len(input_paths)} total testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
216 | for in_path in input_paths:
217 | tc_no = int(re.search(r"input\.(\d+)\.txt", in_path).group(1))
218 | if testcases is not None and str(tc_no) not in testcases and tc_no not in testcases:
219 | continue
220 | tc_2_in_path[tc_no] = in_path
221 | logging.info(f"Found {len(tc_2_in_path)} testcases to actually run for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
222 | tc_2_results = {}
223 | any_incorrect_or_timeout = False
224 | logging.critical(f"Running {bin_path} on testcases: {tc_2_in_path.keys()}")
225 | for tc_no, in_path in tc_2_in_path.items():
226 | # logging.critical(f"Running {bin_path} on testcase {tc_no} with input {in_path}")
227 |         #### TODO: MAKE SURE ALL CODE/BINARIES ARE IN UNIQUE DIRECTORIES
228 | stats_out_path = os.path.splitext(bin_path)[0] + f".{tc_no}.txt"
229 | if exit_early_on_fail and any_incorrect_or_timeout:
230 | tc_2_results[tc_no] = {"success": False, "error": "Previous testcase was incorrect or timed out, so skipping this testcase",
231 | "stats": None, "stdout": None, "stderr": None, "time": None}
232 | else:
233 | try:
234 | returncode, stdout, stderr = exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout, cpu_number=cpu_number)
235 | if returncode != 0:
236 | tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, return code: {returncode}, stderr: {stderr.decode('utf-8')}",
237 | "stats": None, "stdout": stdout, "stderr": stderr, "time": None}
238 | any_incorrect_or_timeout = True
239 | else:
240 | tc_2_results[tc_no] = {"success": True, "error": None, "stats": parse_stats_txt(stats_out_path), "stdout": stdout, "stderr": stderr, "time": parse_stats_txt(stats_out_path)["sim_seconds_precise"]}
241 | except Exception as e:
242 | traceback_err = traceback.format_exc()
243 | tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, error: {e}, traceback: {traceback_err}",
244 | "stats": None, "stdout": None, "stderr": None, "time": None}
245 | any_incorrect_or_timeout = True
246 | return tc_2_results
247 |
248 |
249 | def run_gem5_multi(gem5_dir, gem5_script_path, cpu_type, bin_paths, problem_ids, testcases_dir, timeout, test_cases_list: List[int] = None, cpu_number=None, exit_early_on_fail=True):
250 | if test_cases_list is None:
251 | test_cases_list = [None for _ in range(len(bin_paths))]
252 | bin2results = defaultdict(dict)
253 | for bin_path, problem_id, test_cases in zip(bin_paths, problem_ids, test_cases_list):
254 | bin2results[bin_path] = run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, test_cases, cpu_number, exit_early_on_fail)
255 | return bin2results
256 |
257 | #### hyperfine
258 |
259 | FSTREAM_HEADER="#include <fstream>" # for redirecting io
260 |
261 | CPP_HEADERS=[FSTREAM_HEADER]
262 |
263 | def make_redirect_io_cpp(testcase_path, output_path=None):
264 | lines = f"\nstd::ifstream cin(\"{testcase_path}\");\n"
265 | if output_path:
266 | lines = lines + f"std::ofstream cout(\"{output_path}\");\n\n"
267 | return lines
268 |
269 | def add_headers_cpp(code_str):
270 | for header in CPP_HEADERS:
271 | if header not in code_str:
272 | code_str = header + "\n" + code_str
273 | return code_str
274 |
275 |
276 | def insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs=None):
277 | import re
278 | ## match all whitespace after main and include that in the match greedy
279 | m = re.search("main(\s*)[^\{}]*{", code_str)
280 | if m is None:
281 | raise ValueError("No main function found")
282 | insert_idx = m.end()
283 | io_redirects = make_redirect_io_cpp(path_to_testcases, path_to_outputs)
284 | return code_str[:insert_idx] + io_redirects + code_str[insert_idx:]
285 |
286 |
287 | def redirect_cpp_io(code_str, path_to_testcases, path_to_outputs=None):
288 | code_str = add_headers_cpp(code_str)
289 | code_str = insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs)
290 | return code_str
291 |
292 |
293 | def redirect_cpp_io_file(code_path, stdin_path, stdout_path=None, new_code_dir=None):
294 | input_basename = os.path.splitext(os.path.basename(stdin_path))[0].replace(".", "_")
295 | if new_code_dir is None:
296 | new_code_dir = os.path.dirname(code_path)
297 | if stdout_path is None:
298 | basename = os.path.splitext(os.path.basename(code_path))[0]
299 | stdout_path = os.path.join(new_code_dir, f"{basename}_{input_basename}.stdout")
300 | with open(code_path, "r") as f:
301 | code_str = f.read()
302 | code_str = redirect_cpp_io(code_str, stdin_path, stdout_path)
303 | new_code_path = os.path.join(new_code_dir, f"redirected_{input_basename}_{os.path.basename(code_path)}")
304 | with open(new_code_path, "w") as f:
305 | f.write(code_str)
306 | return new_code_path, stdout_path
307 |
308 |
309 | def redirect_cpp_io_and_compile(code_path, stdin_path, cpu_number=None, new_code_dir=None, stdout_path=None, cflags="--std=c++17 -O3"):
310 |     new_code_path, stdout_path = redirect_cpp_io_file(code_path, stdin_path, stdout_path=stdout_path, new_code_dir=new_code_dir)  # pass by keyword: redirect_cpp_io_file takes stdout_path before new_code_dir
311 | new_binary_path = compile_cpp_code(new_code_path, cpu_number=cpu_number, cflags=cflags)
312 | return new_binary_path, new_code_path, stdout_path
313 |
314 |
315 | ## physical / logical cpu management
316 |
317 | def get_physical_cpu_list():
318 | cmd = " grep -E '^processor|^physical id|^core id' /proc/cpuinfo "
319 | output = os.popen(cmd).read()
320 | output = output.split("processor")
321 | output = [x for x in output if x]
322 | physical2logical = defaultdict(list)
323 | n_logical = 0
324 | for cpu_info in output:
325 | logical_id = re.search("(?<=\t: )\d+", cpu_info).group(0)
326 | physical_id = re.search("(?<=core id\t\t: )\d+", cpu_info).group(0)
327 | physical2logical[int(physical_id)].append(int(logical_id))
328 | n_logical += 1
329 | n_physical = len(physical2logical)
330 | from pprint import pformat
331 | logging.info(f"Physical CPU (n={n_physical}) to Logical CPU (n={n_logical}) mapping:")
332 | logging.info(pformat(sorted(dict(physical2logical).items(), key=lambda x: int(x[0]))))
333 | unique_logical_ids = []
334 | for physical_id, logical_ids in physical2logical.items():
335 | unique_logical_ids.append(logical_ids[0])
336 | logging.info(f"The set of logical ids available for use (n={len(unique_logical_ids)}):")
337 | logging.info(unique_logical_ids)
338 | return unique_logical_ids
339 |
340 | def add_logicial_cpus_to_queue(num_processes, queue):
341 | highest_num_processes = multiprocessing.cpu_count()
342 | if num_processes < 0:
343 | num_processes = highest_num_processes
344 | else:
345 | if num_processes > highest_num_processes:
346 | raise ValueError(f"num_processes {num_processes} is greater than the highest available cpu: {highest_num_processes}.")
347 | available_cpus = list(range(num_processes))
348 | if len(available_cpus) > 2:
349 | available_cpus = available_cpus[:-2]
350 | else:
351 | logging.warning(f"there are fewer than 3 logical CPUs which is not recommended")
352 | for cpu_id in available_cpus:
353 | queue.put(cpu_id)
354 | logging.info(f"List of cpus to be used: {available_cpus}")
355 | return available_cpus
356 |
357 | def add_physical_cpus_to_queue(num_processes, queue):
358 | available_cpus = [i for i in get_physical_cpu_list() if i >= 0]
359 | if len(available_cpus) > 2:
360 | available_cpus = available_cpus[:-2]
361 | else:
362 | logging.warning(f"there are fewer than 3 physical CPUs which is not recommended")
363 | if num_processes < 0:
364 | num_processes = len(available_cpus)
365 | elif len(available_cpus) < num_processes:
366 | raise ValueError(f"Only {len(available_cpus)} available cpus, but {num_processes} processes requested; the set of available cpus is {available_cpus}")
367 | for cpu_id in available_cpus[:num_processes]:
368 | queue.put(cpu_id)
369 | logging.info(f"List of cpus to be used: {available_cpus[:num_processes]}")
370 | return available_cpus
371 |
372 | def run_benchmark(args, json_output_path, timeout_seconds: int = 60) -> Union[str, None]:
373 | try:
374 | logging.info(f"Running {' '.join(args)}")
375 | proc = subprocess.Popen(
376 | args,
377 | preexec_fn=limit_virtual_memory,
378 | # stderr=subprocess.DEVNULL,
379 | # stdout=subprocess.DEVNULL
380 | )
381 | output = proc.communicate(timeout=timeout_seconds)[0]
382 | if os.path.exists(json_output_path):
383 | results = json.load(open(json_output_path)).get("results", [])
384 | return results, output
385 | else:
386 | return None, output
387 | except subprocess.TimeoutExpired:
388 | logging.warning(f"Timeout for {args}")
389 | _kill(proc.pid) # type: ignore
390 | return None, f"Timeout after {timeout_seconds} seconds"
391 | except json.decoder.JSONDecodeError:
392 | logging.warning(f"JSONDecodeError for {args}")
393 | return None, f"JSONDecodeError"
394 | except KeyboardInterrupt as e:
395 | _kill(proc.pid) # type: ignore
396 | raise e
397 |
398 |
399 | def run_hyperfine(code_paths: List[str],
400 | problem_ids: List[str],
401 | path_to_testcases: str,
402 | json_out_path: str, # TODO REMOVE json_out_path
403 | test_cases_list: List[int] = None,
404 | min_runs_per_test_case: int = None,
405 | max_runs_per_test_case: int = None,
406 | strict_runs_per_test_case: bool = False,
407 | warmup_runs_per_test_case: int = 5,
408 | cpu_number: int = None,
409 | do_sanity_check: bool = False,
410 | cflags: str = "--std=c++17 -O3"):
411 | """
412 |     Benchmark all given code paths in a single hyperfine invocation (one exported JSON), with every command pinned to the same CPU.
413 | """
414 |
415 | ### TODO: need to change to handle compilation errors and timeouts
416 |
417 | code2benchmarks = defaultdict(list)
418 | benchmark2code = {}
419 | code2results = defaultdict(dict)
420 | code2testcases = defaultdict(list)
421 | if test_cases_list is None:
422 | test_cases_list = [None] * len(code_paths)
423 | for code_path, problem_id, test_case_list in zip(code_paths, problem_ids, test_cases_list):
424 | problem_dir = os.path.join(path_to_testcases, problem_id)
425 | testcases_paths = glob.glob(os.path.join(problem_dir, "input.*.txt"))
426 | if test_case_list is not None:
427 |             testcases_paths = [t for t in testcases_paths if int(re.search(r"(?<=input\.)\d+", t).group(0)) in test_case_list]
428 |         test_case_numbers = [int(re.search(r"(?<=input\.)\d+", t).group(0)) for t in testcases_paths]
429 | code2testcases[code_path] = test_case_numbers
430 | for testcase_path in testcases_paths:
431 | bin_redirect, code_redirect, _ = redirect_cpp_io_and_compile(code_path,
432 | testcase_path,
433 | cpu_number=cpu_number,
434 | cflags=cflags)
435 | code2benchmarks[code_path].append(bin_redirect)
436 | benchmark2code[bin_redirect] = code_path
437 |
438 | cmds = " ".join([bin_redirect for bin_redirects in code2benchmarks.values() for bin_redirect in bin_redirects])
439 | n_cmds = len(cmds.split(" "))
440 | if strict_runs_per_test_case:
441 | assert min_runs_per_test_case is not None
442 | runs_str = f" --runs {min_runs_per_test_case}"
443 | else:
444 | runs_str = ""
445 | if min_runs_per_test_case is not None:
446 | runs_str += f" --min-runs {min_runs_per_test_case}"
447 | if max_runs_per_test_case is not None:
448 | runs_str += f" --max-runs {max_runs_per_test_case}"
449 | if warmup_runs_per_test_case is not None:
450 | runs_str += f" --warmup {warmup_runs_per_test_case}"
451 |
452 | cmd_benchmark = (
453 | f"hyperfine {runs_str} -N {cmds} --export-json {json_out_path} "
454 | )
455 |
456 | if cpu_number is not None:
457 | cmd_benchmark = f"taskset --cpu-list {cpu_number} {cmd_benchmark}"
458 |
459 | if do_sanity_check:
460 | SANITY_CHECK_TIMEOUT = 1.5 * n_cmds
461 | cmd_sanity_check = cmd_benchmark.replace(runs_str, f" --runs 2 --warmup 1 ")
462 | p = subprocess.run(shlex.split(cmd_sanity_check), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=SANITY_CHECK_TIMEOUT, encoding="utf-8")
463 | if p.returncode != 0:
464 | return None, f"Sanity check failed for {cmd_sanity_check}: {p.stderr}"
465 | results, output = run_benchmark(shlex.split(cmd_benchmark), json_out_path)
466 |
467 |     for result in (results or []):  # results is None when hyperfine timed out or its JSON could not be parsed
468 | command = result["command"]
469 |         tc_no = int(re.search(r"(?<=input_)\d+", command).group(0))
470 | code2results[benchmark2code[command]][tc_no] = result
471 | for bin, code in benchmark2code.items():
472 | results = code2results[code]
473 | missing_tcs = set(code2testcases[code]) - set(results.keys())
474 | for tc_no in missing_tcs:
475 | results[tc_no] = None
476 | return code2results, output
477 |
478 |
479 |
480 |
481 |
--------------------------------------------------------------------------------
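
A note on the hyperfine output handling above: `run_hyperfine` recovers the test-case number for each timing entry from the name of the redirected binary (`..._input_<n>`). The sketch below shows the same post-processing on a hyperfine `--export-json` file in isolation, assuming the usual `results` layout that hyperfine emits; the helper name is illustrative.

```python
import json
import re

def summarize_hyperfine_json(json_path: str) -> dict:
    """Map test-case number -> mean wall-clock seconds from a hyperfine --export-json file."""
    with open(json_path) as f:
        results = json.load(f).get("results", [])
    tc2mean = {}
    for result in results:
        # binaries produced by redirect_cpp_io_and_compile carry the test case in their name, e.g. "..._input_3"
        match = re.search(r"(?<=input_)\d+", result["command"])
        if match is not None:
            tc2mean[int(match.group(0))] = result["mean"]
    return tc2mean
```
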
/gem5/gem5_api.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify
2 | import argparse
3 | import json
4 | import logging
5 | from datetime import datetime
6 | import os
7 | from joblib import Parallel, delayed
8 | import benchmarking
9 | import tempfile
10 | import multiprocessing
11 | import numpy as np
12 | import joblib
13 | from tqdm import tqdm
14 | import contextlib
15 |
16 | LOGGING_DIR="/home/logs/"
17 | if not os.path.exists(LOGGING_DIR):
18 | os.makedirs(LOGGING_DIR)
19 |
20 |
21 | logger = logging.getLogger(__name__)
22 | logger.setLevel(logging.CRITICAL)
23 |
24 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
25 |
26 | # Create a file handler for the log file
27 | start_date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
28 | file_handler = logging.FileHandler(os.path.join(LOGGING_DIR, start_date_time + "_gem5_api.log"))
29 | file_handler.setLevel(logging.DEBUG)
30 | file_handler.setFormatter(formatter)
31 |
32 | # Create a stream handler to print the logs to stdout
33 | stream_handler = logging.StreamHandler()
34 | stream_handler.setLevel(logging.INFO)
35 | stream_handler.setFormatter(formatter)
36 |
37 | # Add both handlers to the logger
38 | logger.addHandler(file_handler)
39 | logger.addHandler(stream_handler)
40 |
41 |
42 | app = Flask(__name__)
43 |
44 |
45 | global MANAGER
46 | global QUEUE
47 | global N_CPUS
48 | MANAGER = ...
49 | QUEUE = ...
50 | N_CPUS=... # Will be set in init_globals after parse_args()
51 |
52 | @contextlib.contextmanager
53 | def tqdm_joblib(tqdm_object):
54 | """Context manager to patch joblib to report into tqdm progress bar given as argument"""
55 | class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
56 | def __call__(self, *args, **kwargs):
57 | tqdm_object.update(n=self.batch_size)
58 | return super().__call__(*args, **kwargs)
59 |
60 | old_batch_callback = joblib.parallel.BatchCompletionCallBack
61 | joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
62 | try:
63 | yield tqdm_object
64 | finally:
65 | joblib.parallel.BatchCompletionCallBack = old_batch_callback
66 | tqdm_object.close()
67 |
68 |
69 | def init_globals(n_workers: int = -1, use_logical_cpus: bool = False):
70 | global MANAGER
71 | global QUEUE
72 | global N_CPUS
73 |
74 | MANAGER = multiprocessing.Manager()
75 | QUEUE = MANAGER.Queue()
76 | if use_logical_cpus:
77 | cpu_list = benchmarking.add_logicial_cpus_to_queue(n_workers, QUEUE)
78 | else:
79 | cpu_list = benchmarking.add_physical_cpus_to_queue(n_workers, QUEUE)
80 | N_CPUS = len(cpu_list)
81 | print(f"Initialized globals with {N_CPUS} cpus")
82 | return None
83 |
84 |
85 | def parse_args():
86 | parser = argparse.ArgumentParser(description='Gem5 API')
87 | parser.add_argument('--api_key', type=str, help='required API key on initialization for authentication')
88 | parser.add_argument('--port', type=int, default=706965, help='port number')
89 | parser.add_argument('--working_dir', type=str, default='/home/working_dir', help='working directory')
90 | parser.add_argument('--use_logical_cpus', default=False, action="store_true")
91 | parser.add_argument('--workers', type=int, default=-1, help='number of workers, if <0 (e.g. -1) then it uses all available physical cpus')
92 | parser.add_argument('--threaded', default=False, action="store_true")
93 | parser.add_argument('--gem5_acc_threshold', type=float, default=0.95, help="mean threshold where if below this, we do not run gem5")
94 | parser.add_argument('--debug', default=False, action="store_true")
95 | parser.add_argument('--exit_early_on_fail', action="store_true")
96 | ## gem5 and compilation parameters
97 | parser.add_argument('--testcases_dir', type=str, help='testcases directory', default="/home/pie-perf/data/codenet/merged_test_cases/")
98 | parser.add_argument('--cstd', type=str, help='cstd', default='--std=c++17')
99 | parser.add_argument('--optimization_flag', type=str, help='optimization', default='-O3')
100 | parser.add_argument('--gem5_dir', type=str, help='path containing gem5 binary and build', default='/home/gem5/build/X86/')
101 | parser.add_argument('--gem5_script_path', type=str, help='path to gem5 script', default='/home/gem5-skylake-config/gem5-configs/run-se.py')
102 | parser.add_argument('--cpu_type', type=str, help='cpu type', default='Verbatim')
103 | parser.add_argument('--path_to_atcoder', type=str, help='path to atcoder', default='/home/ac-library/')
104 | parser.add_argument('--timeout_seconds_binary', type=int, help='timeout seconds for binary', default=10)
105 | parser.add_argument('--timeout_seconds_gem5', type=int, help='timeout seconds for gem5', default=120)
106 |
107 |
108 | args = parser.parse_args()
109 | app.config.update(vars(args))
110 | return args
111 |
112 | def single_submission(code, testcases, problem_id, timing_env, queue, override_flags=""):
113 | ## TODO -> check if any test cases are missing with hyperfine
114 | logging.info(f"single_submission for problem {problem_id} with timing_env {timing_env} and testcases {testcases}")
115 | override_flags = "" if not isinstance(override_flags, str) else override_flags
116 | result = {}
117 | cpu_number = queue.get(block=True) if timing_env in ("binary", "both") else None
118 | logging.info(f"got cpu {cpu_number} in pid {os.getpid()}")
119 | with tempfile.TemporaryDirectory() as tmpdirname:
120 | code_path = os.path.join(tmpdirname, 'code.cpp')
121 | with open(code_path, 'w') as f:
122 | f.write(code)
123 | print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']} override_flags: {override_flags }")
124 | cflags = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags
125 | bin_path, accs = benchmarking.compile_and_check_outputs(
126 | code_path=code_path,
127 | problem_id=problem_id,
128 | testcases_dir=app.config['testcases_dir'],
129 | timeout=app.config['timeout_seconds_binary'],
130 | cflags=cflags,
131 | testcases=testcases,
132 | cpu_number=cpu_number)
133 | result["compile_success"] = bin_path is not None
134 | result['accs'] = accs
135 | mean_accs = np.mean(list(accs.values()))
136 | logging.info(f"mean_accs: {mean_accs}")
137 | if mean_accs < app.config["gem5_acc_threshold"]:
138 | logging.info(f"mean_accs: {mean_accs} is below threshold {app.config['gem5_acc_threshold']}, skipping gem5")
139 | if timing_env in ["gem5", "both"]:
140 | result["gem5"] = {} # return empty dict
141 | if timing_env in ["binary", "both"]:
142 | result["binary"] = {} # return empty dict
143 | return result
144 |
145 | if timing_env in ['gem5', 'both']:
146 | logging.info(f"running gem5 for problem {problem_id}")
147 | gem5_results = benchmarking.run_gem5(
148 | gem5_dir=app.config['gem5_dir'],
149 | gem5_script_path=app.config['gem5_script_path'],
150 | cpu_type=app.config['cpu_type'],
151 | bin_path=bin_path,
152 | problem_id=problem_id,
153 | testcases_dir=app.config['testcases_dir'],
154 | timeout=app.config['timeout_seconds_gem5'],
155 | testcases=testcases,
156 | cpu_number=cpu_number,
157 | exit_early_on_fail=app.config['exit_early_on_fail'])
158 | result['gem5'] = gem5_results
159 | if timing_env in ['binary', 'both']:
160 | code2results, output = benchmarking.run_hyperfine(
161 | code_paths=[code_path],
162 | problem_ids=[problem_id],
163 | path_to_testcases=app.config['testcases_dir'],
164 | # TODO: REMOVE THIS HERE
165 | json_out_path=os.path.join(tmpdirname, 'hyperfine_results.json'),
166 | test_cases_list=[testcases],
167 | min_runs_per_test_case=10,
168 | max_runs_per_test_case=500,
169 | warmup_runs_per_test_case=5,
170 | cpu_number=cpu_number,
171 | do_sanity_check=True) # TODO: PIN TO CPU
172 | binary_results = code2results[code_path]
173 | result["binary"] = binary_results
174 |     if cpu_number is not None: queue.put(cpu_number)  # a CPU is only reserved when timing_env is "binary" or "both"
175 | return result
176 |
177 |
178 | def dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0="", override_flags_v1=""):
179 | override_flags_v0 = "" if not isinstance(override_flags_v0, str) else override_flags_v0
180 | override_flags_v1 = "" if not isinstance(override_flags_v1, str) else override_flags_v1
181 | result = {}
182 | cpu_number = queue.get(block=True)
183 | with tempfile.TemporaryDirectory() as tmpdirname_v0, tempfile.TemporaryDirectory() as tmpdirname_v1:
184 | code_path_v0 = os.path.join(tmpdirname_v0, 'code.cpp')
185 | with open(code_path_v0, 'w') as f:
186 | f.write(code_v0)
187 | code_path_v1 = os.path.join(tmpdirname_v1, 'code.cpp')
188 | with open(code_path_v1, 'w') as f:
189 | f.write(code_v1)
190 |
191 | print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']} override_flags_v0: {override_flags_v0 }")
192 | cflags_v0 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags_v0
193 | cflags_v1 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags_v1
194 |
195 | bin_path_v0, accs_v0 = benchmarking.compile_and_check_outputs(
196 | code_path=code_path_v0,
197 | problem_id=problem_id,
198 | testcases_dir=app.config['testcases_dir'],
199 | timeout=app.config['timeout_seconds_binary'],
200 | cflags=cflags_v0,
201 | testcases=testcases,
202 | cpu_number=cpu_number)
203 | bin_path_v1, accs_v1 = benchmarking.compile_and_check_outputs(
204 | code_path=code_path_v1,
205 | problem_id=problem_id,
206 | testcases_dir=app.config['testcases_dir'],
207 | timeout=app.config['timeout_seconds_binary'],
208 | cflags=cflags_v1,
209 | testcases=testcases,
210 | cpu_number=cpu_number)
211 | result["compile_success_v0"] = bin_path_v0 is not None
212 | result["compile_success_v1"] = bin_path_v1 is not None
213 | result['accs_v0'] = accs_v0
214 | result['accs_v1'] = accs_v1
215 | if timing_env in ['gem5', 'both']:
216 | gem5_results_v0 = benchmarking.run_gem5(
217 | gem5_dir=app.config['gem5_dir'],
218 | gem5_script_path=app.config['gem5_script_path'],
219 | cpu_type=app.config['cpu_type'],
220 | bin_path=bin_path_v0,
221 | problem_id=problem_id,
222 | testcases_dir=app.config['testcases_dir'],
223 | timeout=app.config['timeout_seconds_gem5'],
224 | testcases=testcases,
225 | cpu_number=cpu_number,
226 | exit_early_on_fail=app.config['exit_early_on_fail'])
227 | result['gem5_v0'] = gem5_results_v0
228 | gem5_results_v1 = benchmarking.run_gem5(
229 | gem5_dir=app.config['gem5_dir'],
230 | gem5_script_path=app.config['gem5_script_path'],
231 | cpu_type=app.config['cpu_type'],
232 | bin_path=bin_path_v1,
233 | problem_id=problem_id,
234 | testcases_dir=app.config['testcases_dir'],
235 | timeout=app.config['timeout_seconds_gem5'],
236 | testcases=testcases,
237 | cpu_number=cpu_number,
238 | exit_early_on_fail=app.config['exit_early_on_fail'])
239 | result['gem5_v1'] = gem5_results_v1
240 | if timing_env in ['binary', 'both']:
241 | code2results, output = benchmarking.run_hyperfine(
242 | code_paths=[code_path_v0, code_path_v1],
243 | problem_ids=[problem_id, problem_id],
244 | path_to_testcases=app.config['testcases_dir'],
245 | json_out_path=os.path.join(tmpdirname_v0, 'hyperfine_results.json'),
246 | test_cases_list=[testcases, testcases],
247 | min_runs_per_test_case=10,
248 | max_runs_per_test_case=500,
249 | warmup_runs_per_test_case=5,
250 | cpu_number=cpu_number,
251 | do_sanity_check=True)
252 | result["binary_v0"] = code2results[code_path_v0]
253 | result["binary_v1"] = code2results[code_path_v1]
254 | queue.put(cpu_number)
255 | return result
256 |
257 |
258 | def multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list=None):
259 | assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
260 | with tqdm_joblib(tqdm(desc="Running multiple single submissions", total=len(code_list))) as progress_bar:
261 | results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(single_submission)(code, testcases, problem_id, timing_env, queue, override_flags) for code, testcases, problem_id, override_flags in zip(code_list, testcases_list, problem_id_list, override_flags_list))
262 | return results
263 |
264 | def multiple_dual_submissions(code_v0_list, code_v1_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list_v0, override_flags_list_v1):
265 | assert len(code_v0_list) == len(code_v1_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(override_flags_list_v1)
266 | results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(dual_submission)(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0, override_flags_v1) for code_v0, code_v1, testcases, problem_id, override_flags_v0, override_flags_v1 in zip(code_v0_list, code_v1_list, testcases_list, problem_id_list, override_flags_list_v0, override_flags_list_v1))
267 | return results
268 |
269 |
270 | @app.route('/gem5/single_submission', methods=['GET'])
271 | def SingleSubmission():
272 | req = request.get_json()
273 | if req["api_key"] != app.config["api_key"]:
274 | return jsonify({"error": "Invalid API key"})
275 | code = req['code']
276 | testcases = req['testcases']
277 | problem_id = req['problem_id']
278 | timing_env = req['timing_env']
279 | assert len(testcases) > 0
280 | assert len(code) > 0
281 | assert timing_env in ['gem5', 'binary', 'both']
282 |
283 | override_flags = req.get('override_flags', "")
284 | results = single_submission(code, testcases, problem_id, timing_env, QUEUE, override_flags)
285 | return jsonify(results)
286 |
287 | @app.route('/gem5/multiple_single_submissions', methods=['GET'])
288 | def MultipleSubmissions():
289 | req = request.get_json()
290 | if req["api_key"] != app.config["api_key"]:
291 | return jsonify({"error": "Invalid API key"})
292 | submissions = req['submissions']
293 | timing_env = req['timing_env']
294 | code_list = [r['code'] for r in submissions]
295 | testcases_list = [r['testcases'] for r in submissions]
296 | problem_id_list = [r['problem_id'] for r in submissions]
297 | override_flags_list = [r.get('override_flags_list', "") for r in submissions]
298 |
299 | assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
300 | assert timing_env in ['gem5', 'binary', 'both']
301 | assert len(code_list) > 0
302 | assert len(testcases_list) > 0
303 | assert len(problem_id_list) > 0
304 | assert len(override_flags_list) > 0
305 | assert all([len(code) > 0 for code in code_list])
306 | assert all([len(testcases) > 0 for testcases in testcases_list])
307 |
308 | results = multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list)
309 |
310 | return jsonify(results)
311 |
312 | @app.route('/gem5/single_submission_pair', methods=['GET'])
313 | def SingleSubmissionPair():
314 | req = request.get_json()
315 | if req["api_key"] != app.config["api_key"]:
316 | return jsonify({"error": "Invalid API key"})
317 | #assert len(req) == 2
318 | code_v0 = req['code_v0']
319 | code_v1 = req['code_v1']
320 | testcases = req['testcases']
321 | problem_id = req['problem_id']
322 | timing_env = req['timing_env']
323 | assert len(testcases) > 0
324 | assert len(code_v0) > 0
325 | assert len(code_v1) > 0
326 | assert timing_env in ['gem5', 'binary', 'both']
327 |
328 | override_flags = req.get('override_flags', "")
329 | results = dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, QUEUE, override_flags)
330 | return jsonify(results)
331 |
332 | @app.route('/gem5/multiple_submissions_pairs', methods=['GET'])
333 | def MultipleSubmissionsPair():
334 | req = request.get_json()
335 | if req["api_key"] != app.config["api_key"]:
336 | return jsonify({"error": "Invalid API key"})
337 | submissions_v0 = req['submissions_v0']
338 | submissions_v1 = req['submissions_v1']
339 | timing_env = req['timing_env']
340 |
341 | code_list_v0 = [r['code'] for r in submissions_v0]
342 | code_list_v1 = [r['code'] for r in submissions_v1]
343 | testcases_list = [r['testcases'] for r in submissions_v0]
344 | problem_id_list = [r['problem_id'] for r in submissions_v0]
345 |
346 | override_flags_list_v0 = [r.get('override_flags_list', "") for r in submissions_v0]
347 | override_flags_list_v1 = [r.get('override_flags_list', "") for r in submissions_v1]
348 |
349 | assert len(code_list_v0) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(code_list_v1) == len(override_flags_list_v1)
350 | assert timing_env in ['gem5', 'binary', 'both']
351 | assert len(code_list_v0) > 0
352 | assert len(testcases_list) > 0
353 | assert len(problem_id_list) > 0
354 | assert all([len(code) > 0 for code in code_list_v0])
355 | assert all([len(code) > 0 for code in code_list_v1])
356 | assert all([len(testcases) > 0 for testcases in testcases_list])
357 |
358 | results = multiple_dual_submissions(code_list_v0, code_list_v1, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list_v0, override_flags_list_v1)
359 | return jsonify(results)
360 |
361 | @app.route('/gem5/ping', methods=['GET'])
362 | def Ping():
363 | return jsonify({"status": "ok"})
364 |
365 |
366 | if __name__ == '__main__':
367 | args = parse_args()
368 | init_globals(args.workers, args.use_logical_cpus)
369 | app.run(host="0.0.0.0", port=args.port, debug=args.debug)
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
--------------------------------------------------------------------------------
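
The routes above read a JSON body (even though they are registered as GET), so the service can be exercised with a plain `requests` client. A minimal sketch, assuming the server is reachable on localhost at whatever `--port` it was started with and that the key matches its `--api_key`; the helper names are illustrative.

```python
import requests

API_URL = "http://localhost:8888"  # assumed host/port; use whatever --port the server was started with
API_KEY = "YOUR_API_KEY"           # must match the --api_key passed to gem5_api.py

def ping() -> dict:
    return requests.get(f"{API_URL}/gem5/ping").json()

def time_single_program(code: str, problem_id: str, testcases: list) -> dict:
    payload = {
        "api_key": API_KEY,
        "code": code,
        "problem_id": problem_id,   # e.g. "p02549"
        "testcases": testcases,     # e.g. [0, 1]
        "timing_env": "both",       # one of "gem5", "binary", "both"
    }
    # the route is registered as GET but reads a JSON body via request.get_json()
    resp = requests.get(f"{API_URL}/gem5/single_submission", json=payload)
    resp.raise_for_status()
    return resp.json()
```
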
/gem5/gem5_eval.py:
--------------------------------------------------------------------------------
1 | # from src.codenet_eval.run_eval import (read_ground_truths, read_inputs_and_prepare)
2 | # from src.codenet_eval.evalconfig import EvaluationConfig
3 | import tarfile
4 | import shutil
5 | import tempfile
6 | import logging
7 | import pandas as pd
8 | import json
9 | import os
10 | import pdb
11 | import argparse
12 | from gem5.simulator import PieEnvironment
13 | from gem5 import simulator
14 | import traceback
15 | import pdb
16 | import threading
17 | from tqdm import tqdm
18 | import re
19 | from typing import Optional, Any
20 | import yaml
21 | from dataclasses import dataclass, field
22 | import ast
23 |
24 | logging.basicConfig(level=logging.INFO)
25 |
26 | import signal
27 | import time
28 |
29 | KEY_COLS = ["n_tests",
30 | "problem_id",
31 |             "tests",
32 | "src_id",
33 | "tgt_id",
34 | "fastest_runtime", "fastest_accuracy"]
35 |
36 |
37 | def get_key_columns(df, cfg):
38 | ## in key columns or if
39 | ## *_test_compilation, *_test_accuracy, *_test_agg_runtime, *_tc2time
40 | key_cols = [c for c in df.columns if c in KEY_COLS or c.endswith("_compilation") or c.endswith("_accuracy") or c.endswith("_runtime") or c.endswith("_tc2time")]
41 | key_cols += [c for c in df.columns if cfg.model_generated_potentially_faster_code_col in c] + [cfg.slow_code_col, cfg.reference_code_col]
42 | key_cols = list(set(key_cols))
43 | return df[key_cols]
44 |
45 | def _fix_value(x: Any) -> Any:
46 | ## if starts with '[' and ends with ']', as a string, then convert to list
47 | if isinstance(x, str) and len(x) > 1 and x[0] == '[' and x[-1] == ']':
48 | x = ast.literal_eval(x)
49 | return x
50 |
51 | def fix_df_columns(df):
52 | for col in df.columns:
53 | df[col] = df[col].apply(lambda x: _fix_value(x))
54 | return df
55 |
56 |
57 |
58 | def unmelt_results(results_df, cfg, remove_extra_cols=False):
59 | unmelted_data = []
60 | for src_id, group in results_df.groupby("src_id"):
61 | src_code_row = group[group["code_type"] == "src_code"].iloc[0]
62 | new_row = src_code_row.to_dict()
63 | for index, row in group.iterrows():
64 | new_row["src_id"] = src_id
65 | new_row[f'{row["code_type"]}_compilation'] = row["compilation"]
66 | new_row[f'{row["code_type"]}'] = row["code"]
67 | if row["code_type"].startswith(cfg.model_generated_potentially_faster_code_col) or cfg.redo_src_tgt:
68 | new_row[f'{row["code_type"]}_accuracy'] = row["accuracy"]
69 | new_row[f'{row["code_type"]}_agg_runtime'] = row["agg_runtime"]
70 | new_row[f'{row["code_type"]}_tc2time'] = row["tc2time"]
71 | unmelted_data.append(new_row)
72 | ## clean up the column names
73 | unmelted_df = pd.DataFrame(unmelted_data)
74 | if remove_extra_cols:
75 | unmelted_df = get_key_columns(unmelted_df, cfg)
76 |
77 | # unmelted_df = rename_columns(unmelted_df)
78 |
79 | return unmelted_df
80 |
81 | def report_results(df, cfg, orig_df):
82 | ## all columns will be cfg.model_generated_potentially_faster_code_col_*
83 | ## for these, consider only use those that are not None, above threshold_accuracy, and have the fastest_runtime
84 | ## for those, keep the runtime, but if the accuracy is below threshold_accuracy, set the runtime to float("inf")
85 |
86 | ## then consider only max_generations_to_report
87 |
88 | ## in 1, 2, 4... (powers of 2 up until len(runtimes)), report the best runtime
89 | ## as runtime_best@1, runtime_best@2, runtime_best@4, etc. accuracy_best@1, accuracy_best@2, accuracy_best@4, etc.
90 | ## while also reporting speedup_best@1, speedup_best@2, speedup_best@4, etc. where speedup = runtime_src / runtime_best@n
91 |
92 |
93 | ## then aggregate
94 | ### 1. for each 1, 2.. (powers of 2 up until len(runtimes)), report mean_accuracy@n, mean_speedup@n where we also take speedup = min(1.0, runtime_src / runtime_best@n)
95 | ### 2. for each 1, 2.. (powers of 2 up until len(runtimes)), report the % of programs where the speedup is >= 1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0
96 |
97 | # merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
98 | import pdb
99 | # pdb.set_trace()
100 | # print("columns before report_results")
101 | # print(df.columns)
102 |
103 |
104 | # num_generated_cols = len([c for c in df.columns if re.match(f"{cfg.model_generated_potentially_faster_code_col}_[0-9]+", c) or c == cfg.model_generated_potentially_faster_code_col])
105 | num_generated_cols = cfg.num_generated_cols
106 | assert num_generated_cols is not None, f"num_generated_cols is None, it should have been set in read_inputs_and_prepare_v2"
107 |
108 | import pandas as pd
109 | import numpy as np
110 |
111 | # Assuming orig_df and df are already defined, and cfg and num_generated_cols are given
112 |
113 | # Step 1: Find rows in orig_df that are not in df
114 | # do this with src_code not src_id
115 | print(f"length of orig_df {len(orig_df)} vs length of results_df {len(df)}")
116 | orig_df["src_tgt_code"] = orig_df[cfg.slow_code_col] + orig_df[cfg.reference_code_col]
117 | df["src_tgt_code"] = df[cfg.slow_code_col] + df[cfg.reference_code_col]
118 | # drop duplicates from both
119 | df = df.drop_duplicates(subset=["src_tgt_code"])
120 | orig_df = orig_df.drop_duplicates(subset=["src_tgt_code"])
121 | unique_rows = orig_df[~orig_df['src_tgt_code'].isin(df['src_tgt_code'])]
122 | assert len(unique_rows) == (len(orig_df) - len(df)), f"len(unique_rows) {len(unique_rows)} == len(orig_df) - len(df) {len(orig_df) - len(df)}"
123 |
124 | # Step 2: Create additional columns for the unique rows and set default values
125 | for j in range(num_generated_cols + 1): # Adding 1 to include the case when j == num_generated_cols
126 | colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
127 | unique_rows[f"{colname}_agg_runtime"] = float("inf") # Setting runtime to inf
128 | unique_rows[f"{colname}_accuracy"] = 0 # Setting accuracy to 0
129 | unique_rows[f"{colname}_tc2time"] = [{} for _ in range(len(unique_rows))] # Setting tc2time to {}
130 | # drop unique rows columns that are not in df
131 | unique_rows = unique_rows[[c for c in unique_rows.columns if c in df.columns]]
132 |
133 | # Step 3: Append the modified unique rows to df
134 | df = pd.concat([df, unique_rows], ignore_index=True)
135 |
136 | print(f"columns after appending {df.columns}")
137 | print(f"unique rows columns {unique_rows.columns}")
138 | assert len(df) == 978, f"len(df) {len(df)} == 978"
139 |
140 | new_rows = []
141 | for i, row in df.iterrows():
142 | for j in range(num_generated_cols):
143 | colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
144 | if row[colname] is None or pd.isna(row[colname]) or pd.isnull(row[colname]):
145 | row[f"{colname}_agg_runtime_adjusted"] = float("inf")
146 | if row[f"{colname}_accuracy"] < cfg.threshold_accuracy:
147 | row[f"{colname}_agg_runtime_adjusted"] = float("inf")
148 | else:
149 | row[f"{colname}_agg_runtime_adjusted"] = row[f"{colname}_agg_runtime"]
150 | row["fastest_generated_agg_runtime"] = min([row[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted"] for j in range(num_generated_cols)])
151 | new_rows.append(row)
152 |
153 | df = pd.DataFrame(new_rows)
154 |
155 | problem_id_to_fastest_agg_runtime = {}
156 | problem_id_to_fastest_correctness = {}
157 | for i, group in df.groupby("problem_id"):
158 | problem_id_to_fastest_agg_runtime[i] = group["fastest_generated_agg_runtime"].min()
159 | problem_id_to_fastest_correctness[i] = problem_id_to_fastest_agg_runtime[i] < float("inf")
160 |
161 | df["fastest_generated_runtime_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_agg_runtime[x])
162 | df["fastest_generated_speedup_over_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
163 | df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].apply(lambda x: max(1.0, x))
164 | df["fastest_generated_correctness_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_correctness[x])
165 |
166 |
167 | for i in range(1, num_generated_cols+1):
168 | if num_generated_cols == 0:
169 | df[f"agg_runtime_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_agg_runtime_adjusted"]
170 | df[f"accuracy_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_accuracy"]
171 | df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] == cfg.threshold_accuracy
172 | else:
173 | df[f"agg_runtime_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted" for j in range(i)]].min(axis=1)
174 | df[f"accuracy_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_accuracy" for j in range(i)]].max(axis=1)
175 | df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] == cfg.threshold_accuracy
176 | df[f"speedup_best@{i}"] = df[cfg.slow_code_col+"_agg_runtime"] / df[f"agg_runtime_best@{i}"]
177 | df[f"speedup_best@{i}"] = df[f"speedup_best@{i}"].apply(lambda x: max(1.0, x))
178 | df["speedup_of_fastest_generated_of_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
179 | df["speedup_of_fastest_generated_of_all_submissions"] = df["speedup_of_fastest_generated_of_all_submissions"].apply(lambda x: max(1.0, x))
180 |
181 | ## aggregate over all rows
182 | agg_df = pd.DataFrame(index=[0])
183 | # agg_df["fastest_generated_runtime_over_all_submissions"] = df["fastest_generated_runtime_over_all_submissions"].mean()
184 | agg_df["fastest_generated_correctness_over_all_submissions"] = df["fastest_generated_correctness_over_all_submissions"].mean()
185 | agg_df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].mean()
186 | # import pdb
187 | for i in range(1, num_generated_cols+1):
188 | # pdb.set_trace()
189 | agg_df[f"mean_accuracy_best@{i}"] = df[f"accuracy_best@{i}"].mean()
190 | agg_df[f"is_correct_best@{i}"] = df[f"is_correct_best@{i}"].mean()
191 | agg_df[f"mean_speedup_best@{i}"] = df[f"speedup_best@{i}"].mean()
192 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
193 | agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"] = (df[f"speedup_best@{i}"] >= speedup_threshold).mean()
194 |
195 | ## add the speedup of tgt_code over src_code and the threshold speedups of tgt_code over src_code
196 | df["speedup_tgt_over_src"] = df[cfg.slow_code_col+"_agg_runtime"] / df[cfg.reference_code_col+"_agg_runtime"]
197 | agg_df["mean_speedup_tgt_over_src"] = df["speedup_tgt_over_src"].mean()
198 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
199 | agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_tgt_over_src"] >= speedup_threshold).mean()
200 | agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_of_fastest_generated_of_all_submissions"] >= speedup_threshold).mean()
201 |
202 | ## pretty print out a report
203 |
204 | ## first print out the columns with asterisks separating fields *********
205 | print("********* Aggregated Results *********")
206 | for i in range(1, num_generated_cols+1):
207 | print(f"********* Results Best at {i} Generations *********")
208 | mean_accuracy = agg_df[f"mean_accuracy_best@{i}"][0]
209 | mean_speedup = agg_df[f"mean_speedup_best@{i}"][0]
210 |
211 | print(f"mean_accuracy_best@{i}: {mean_accuracy}")
212 | print(f"mean correctness best@{i}: {agg_df[f'is_correct_best@{i}'][0]}")
213 | print(f"mean_speedup_best@{i}: {mean_speedup} vs. mean_speedup_tgt_over_src: {agg_df['mean_speedup_tgt_over_src'][0]}")
214 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
215 | percent_programs = agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"][0]
216 | percent_programs_tgt_over_src = agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"][0]
217 | print(f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}: {percent_programs} vs. percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs_tgt_over_src}")
218 | print("*****************************************")
219 | print("********* Results Fastest Generated Over All Submissions *********")
220 | print("mean correctness fastest_generated_over_all_submissions: ", agg_df["fastest_generated_correctness_over_all_submissions"][0])
221 | print("average fastest_generated_speedup_over_all_submissions: ", agg_df["fastest_generated_speedup_over_all_submissions"][0])
222 | for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
223 | percent_programs = agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"][0]
224 | print(f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs}")
225 | print("********* End Aggregated Results *********")
226 |
227 | return agg_df, df
228 |
229 | # global env #: PieEnvironment
230 | global env
231 | env = None
232 |
233 | def sigint_handler(signum, frame):
234 | global env
235 | print("Ctrl-C pressed, running teardown...")
236 | if threading.current_thread().name == "MainThread":
237 | env.teardown()
238 | print("Teardown complete, exiting...")
239 | exit(0)
240 |
241 | # Set the signal handler for Ctrl+C (SIGINT)
242 | signal.signal(signal.SIGINT, sigint_handler)
243 |
244 |
245 |
246 | def read_inputs_and_prepare_v2(cfg) -> pd.DataFrame:
247 |     """Reads the model-generated outputs, expands multiple generations into one column each, and returns the prepared dataframe."""
248 | logging.info(f"Reading reference file from {cfg.reference_file_path}")
249 | logging.info(f"Reading model generated outputs from {cfg.model_generated_outputs_path}")
250 |
251 |
252 | gen_df = pd.read_json(
253 | cfg.model_generated_outputs_path, lines=True, orient="records"
254 | )
255 | gen_df = fix_df_columns(gen_df)
256 |
257 | logging.info(f"Read {len(gen_df)} rows from {cfg.model_generated_outputs_path}")
258 | if cfg.is_prompt_based:
259 | gen_df["slower_program"] = gen_df.apply(
260 | lambda x: get_input_from_prompt(x), axis=1
261 | )
262 | else:
263 | gen_df["slower_program"] = gen_df[cfg.slow_code_col].apply(lambda x: x.strip())
264 |
265 |
266 | assert (
267 | cfg.reference_code_col in gen_df.columns
268 | ), f"Column {cfg.reference_code_col} not found in {cfg.model_generated_outputs_path}"
269 | merged = gen_df
270 |
271 |
272 | merged = merged[merged[cfg.slow_code_col] != merged[cfg.reference_code_col]]
273 |
274 | assert (
275 | len(merged) > 0
276 | ), f"{cfg.slow_code_col} and {cfg.reference_code_col} are the same for all programs"
277 |
278 | if cfg.num_problems_to_evaluate != -1:
279 | merged = merged[: cfg.num_problems_to_evaluate]
280 |
281 |
282 | # if the generated code is a list, then we have multiple generations per input.
283 | # we add one column per generation
284 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list) or isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series) or (merged[cfg.model_generated_potentially_faster_code_col].iloc[0][0] == '[' and merged[cfg.model_generated_potentially_faster_code_col].iloc[0][-1] == ']'):
285 |
286 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], str):
287 | import ast
288 | merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: ast.literal_eval(x))
289 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series):
290 | merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x.tolist())
291 | num_generations = max(merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: len(x)).tolist())
292 |
293 | for i in range(num_generations):
294 | merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
295 | # so merged will have the same number of columns for all rows, but some rows will have None in some columns (because they have fewer generations)
296 | else:
297 | num_generations = 1
298 |
299 | cfg.num_generated_cols = num_generations
300 |
301 | return merged
302 |
303 |
304 |
305 | def main(cfg):
306 | # Step 0
307 | merged = read_inputs_and_prepare_v2(cfg)
308 | reference_df = pd.read_json(cfg.reference_file_path, lines=True, orient="records")
309 |
310 | logging.info(f"Number of programs to evaluate: {len(merged)}")
311 | logging.info(f"Input column: {cfg.slow_code_col}")
312 | logging.info(f"Reference column: {cfg.reference_code_col}")
313 | logging.info(f"Model generated column: {cfg.model_generated_potentially_faster_code_col}")
314 |
315 | # Step 1: Read the inputs
316 |
317 | # problem_id_to_ground_truths = read_ground_truths(cfg, merged)
318 |
319 | # Step 2: Write the inputs to a temporary directory
320 |
321 | tempdir = tempfile.TemporaryDirectory()
322 |
323 | ## we need to melt the dataframe from [slow, fast, generated_i] -> column of code_type and column of code
324 | generated_cols = []
325 | if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list):
326 | generated_cols = [colname for colname in merged.columns if colname.startswith(cfg.model_generated_potentially_faster_code_col) and colname[-1].isdigit()]
327 | else:
328 | generated_cols = [cfg.model_generated_potentially_faster_code_col]
329 |
330 | logging.info(f"Generated columns: {generated_cols}")
331 | code_cols = [cfg.slow_code_col, cfg.reference_code_col] + generated_cols
332 |
333 | ##PATCH
334 | ## rename src_agg_runtime -> src_code_agg_runtime and tgt_agg_runtime -> tgt_code_agg_runtime
335 | if "src_agg_runtime" in merged.columns and "tgt_agg_runtime" in merged.columns:
336 | merged = merged.rename(columns={"src_agg_runtime": cfg.slow_code_col+"_agg_runtime", "tgt_agg_runtime": cfg.reference_code_col+"_agg_runtime"})
337 |
338 | melted = pd.melt(merged,
339 | value_vars=code_cols,
340 | var_name="code_type",
341 | value_name="code",
342 | id_vars = [c for c in merged.columns if c not in code_cols])
343 |
344 | orig_len = len(melted)
345 | #drop code na/null
346 | melted = melted.dropna(subset=["code"])
347 |
348 | # sort by "n_tests"
349 | melted = melted.sort_values(by=["n_tests"], ascending=False)
350 |
351 | if not os.path.exists(os.path.join(cfg.output_dir, "test_results.jsonl")):
352 | # drop any rows where the code length is 0
353 | melted = melted[melted["code"].apply(lambda x: len(x) > 0)]
354 | logging.info(f"Dropped {orig_len - len(melted)} rows with NA or empty code")
355 |
356 | if not cfg.redo_src_tgt:
357 | ## remove and cache the rows where code_type == "src_code" or "tgt_code"
358 | src_tgt_rows = melted[(melted["code_type"] == f"{cfg.slow_code_col}") | (melted["code_type"] == f"{cfg.reference_code_col}")]
359 | melted = melted[(melted["code_type"] != f"{cfg.slow_code_col}") & (melted["code_type"] != f"{cfg.reference_code_col}")]
360 | # pdb.set_trace()
361 | else:
362 | ## if we're re-running the src_code and tgt_code, then cache the old agg_runtimes
363 | orig_src_colname = cfg.slow_code_col.replace("_code", "_agg_runtime")
364 | orig_tgt_colname = cfg.reference_code_col.replace("_code", "_agg_runtime")
365 | new_src_colname = cfg.slow_code_col.replace("_code", "_original_agg_runtime")
366 | new_tgt_colname = cfg.reference_code_col.replace("_code", "_original_agg_runtime")
367 | melted.rename(columns={orig_src_colname: new_src_colname, orig_tgt_colname: new_tgt_colname}, inplace=True)
368 |
369 | print(f"Number of programs to evaluate after dropping NA: {len(melted)}")
370 | try:
371 | if not os.path.exists(cfg.output_dir):
372 | os.makedirs(cfg.output_dir)
373 | global env
374 | env = simulator.make(timeout_seconds_gem5=120, verbose=True, use_logical_cpus=True, port=8888, workers=40, exit_early_on_fail=True)
375 |         ## iterate in batches of cpus_available, env.submit_multiple_single_submissions() will submit the batch at once
376 | new_rows = []
377 | pbar = tqdm(total=len(melted), desc=f"Submitting {len(melted)} programs to evaluate", smoothing=0)
378 | if cfg.cpus_available == -1:
379 | cfg.cpus_available = len(melted)
380 | # legacy - we used to submit in batches
381 | batch = melted
382 | # currently sorting the list of tests in reverse order of length, so that the (potentially) longest tests are run first
383 |         # this may give more "conservative" runtime estimates in the tqdm progress bar
384 | results = env.submit_multiple_single_submissions(batch["code"].tolist(),
385 | [sorted(list(t), reverse=True) for t in batch["tests"].tolist()],
386 | batch["problem_id"].tolist(),
387 | "gem5")
388 |
389 | # zip the rows and results together
390 | for (i, row), result in zip(batch.iterrows(), results):
391 | row["compilation"] = result.compilation
392 | row["accuracy"] = result.mean_acc
393 | row["agg_runtime"] = result.agg_runtime
394 | row["tc2time"] = result.tc2time
395 | row["tc2stats"] = result.tc2stats # this is a lot of data, toggle if we need all the outputs from gem5's stats.txt
396 | new_rows.append(row)
397 | # pbar.update(len(batch))
398 | melted = pd.DataFrame(new_rows)
399 | melted.to_json(
400 | f"{cfg.output_dir}/melted_test_results.jsonl",
401 | orient="records",
402 | lines=True
403 | )
404 | env.teardown()
405 | ## if we get an exception, we still want to teardown the environment because it will likely leave a docker container running
406 | except Exception as e:
407 | print(e)
408 | traceback.print_exc()
409 | if threading.current_thread().name == "MainThread":
410 | # global env
411 | env.teardown()
412 | raise e
413 |
414 | if not cfg.redo_src_tgt:
415 | ## add back the src_code and tgt_code rows
416 | melted = pd.concat([melted, src_tgt_rows])
417 |
418 | unmelted_df = unmelt_results(melted, cfg)
419 |
420 | unmelted_df.to_json(
421 | f"{cfg.output_dir}/test_results.jsonl",
422 | orient="records",
423 | lines=True
424 | )
425 | else:
426 | unmelted_df = pd.read_json(
427 | f"{cfg.output_dir}/test_results.jsonl",
428 | orient="records",
429 | lines=True
430 | )
431 |
432 | agg_df, result_df = report_results(unmelted_df, cfg, reference_df)
433 |
434 | agg_df.to_csv(
435 | f"{cfg.output_dir}/aggregated_results.csv",
436 | index=False
437 | )
438 |
439 | result_df.to_json(
440 | f"{cfg.output_dir}/addtl_stats.jsonl",
441 | orient="records",
442 | lines=True
443 | )
444 |
445 | print(f"Results written to {cfg.output_dir}")
446 |
447 |
448 | @dataclass
449 | class EvaluationConfig:
450 | model_generated_outputs_path: str
451 | output_dir: str
452 | reference_file_path: str
453 | is_prompt_based: bool = False
454 | model_generated_potentially_faster_code_col: str = "generated_answers"
455 | slow_code_col: str = "src_code"
456 | reference_code_col: str = "tgt_code"
457 | cpuset_cpus: Optional[str] = None
458 | do_eval: bool = False
459 | cpus_available: int = 1
460 | num_problems_to_evaluate: int = -1
461 | threshold_accuracy: float = 1.0
462 | redo_src_tgt: bool = False
463 | num_generated_cols: int = None
464 |
465 | def load_config(yaml_path: str) -> EvaluationConfig:
466 | with open(yaml_path, 'r') as f:
467 | config_dict = yaml.safe_load(f)
468 | return EvaluationConfig(**config_dict)
469 |
470 | if __name__ == "__main__":
471 | parser = argparse.ArgumentParser()
472 | parser.add_argument("--config_path", type=str, required=True)
473 | args = parser.parse_args()
474 | config = load_config(args.config_path)
475 | main(config)
--------------------------------------------------------------------------------
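
The best@k aggregation in `report_results` is easier to see on a toy example. The sketch below uses made-up runtimes and mirrors the two rules from the script: candidates that failed to compile or fell below the accuracy threshold count as `inf`, and speedups are floored at 1.0.

```python
# Toy illustration of the best@k aggregation in report_results (runtimes are made up).
src_runtime = 6.0
# adjusted runtimes of the generated candidates: missing/inaccurate generations become inf
adjusted_runtimes = [5.0, float("inf"), 4.0, 3.0]

for k in (1, 2, 4):
    best_runtime = min(adjusted_runtimes[:k])         # agg_runtime_best@k
    speedup = max(1.0, src_runtime / best_runtime)    # speedup_best@k, floored at 1.0
    print(f"best@{k}: runtime={best_runtime}, speedup={speedup:.2f}")
# prints speedups of 1.20, 1.20, and 2.00 respectively
```
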
/gem5/pytest_simulator.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from gem5 import simulator
3 | from gem5.simulator import PieEnvironment, PieSingleResult, PiePairResult, make
4 | import numpy as np
5 | from collections import defaultdict
6 | from pprint import pprint
7 |
8 | API_KEY="cdZ5TynkL5D7gCTFvzJT4YKu05aozTLp4GgIcK5"
9 |
10 | example_1_code = """
11 | #include
12 | #define REP(i, n) for (int i = 0; i < (n); i++)
13 | using namespace std;
14 | const int MOD = 998244353;
15 |
16 | int main() {
17 | cin.tie(0)->sync_with_stdio(false);
18 |
19 | int n, k; cin >> n >> k;
20 | vector l(k), r(k);
21 | REP(i, k) cin >> l[i] >> r[i];
22 | REP(i, k) r[i]++;
23 |
24 | vector dp(n + 1, 0);
25 | dp[0] = 1;
26 | dp[1] = -1;
27 | REP(i, n) {
28 | if (i > 0)
29 | dp[i] = (dp[i] + dp[i - 1]) % MOD;
30 | REP(j, k) {
31 | if (i + l[j] < n)
32 | dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
33 | if (i + r[j] < n)
34 | dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
35 | }
36 | }
37 | cout << dp[n - 1] << endl;
38 | return 0;
39 | }
40 | """
41 | example_1_problem_id = "p02549"
42 |
43 | example_2_code = """
44 | #include
45 | #include
46 | typedef long long ll;
47 | typedef unsigned int ui;
48 | #define infin (ll)(998244353)
49 | using namespace std;
50 | int main()
51 | {
52 | int n,k;
53 | cin>>n>>k;
54 | int l,r;
55 | vector dp(n+1,0); //0 to n
56 | vector >v;
57 | for(int j=0;j>l>>r;
60 | v.push_back({l,r});
61 | }
62 | dp[0]=1;;
63 | dp[1]=1;
64 | sort(v.begin(),v.end());
65 | auto z=v.begin();
66 | if ((*z).first==1)
67 | dp[2]=1;
68 | else
69 | dp[2]=0;
70 | for(int i=3;i<=n;i++)
71 | {
72 | dp[i]=dp[i-1];
73 | for (auto x:v)
74 | {
75 | if (i>x.first)
76 | dp[i]+=dp[i-x.first];
77 | else
78 | break;
79 | if (i-1>x.second)
80 | {
81 | dp[i]-=dp[i-1-x.second];
82 | if (dp[i]<0)
83 | dp[i]+=infin;
84 | }
85 | }
86 | dp[i]=(dp[i]) % infin;
87 | }
88 | cout< 0.95
129 | assert result.mean_acc_v1 > 0.95
130 |
131 | pprint(result.tc2time_v0)
132 | pprint(result.tc2time_v1)
133 |
134 | print(
135 | f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
136 | print(
137 | f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
138 | print(
139 | f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
140 | print(
141 | f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
142 |
143 | assert result.tc2time_v0[0] == 0.001035073468
144 | assert result.tc2time_v0[1] == 0.001039205596
145 |
146 | assert result.tc2time_v1[0] == 0.001026564396
147 | assert result.tc2time_v1[1] == 0.001029346032
148 |
149 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
150 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
151 |
152 | for tc, time in hyperfine_v0_tc2stats.items():
153 | tc2hyperfine_v0[tc].append(np.array(time))
154 | for tc, time in hyperfine_v1_tc2stats.items():
155 | tc2hyperfine_v1[tc].append(np.array(time))
156 |
157 | for tc, times_v0 in tc2hyperfine_v0.items():
158 | mean_times_v0 = []
159 | for time_list in times_v0:
160 | mean_times_v0.append(np.mean(time_list))
161 | mean_times_v1 = []
162 | for time_list in tc2hyperfine_v1[tc]:
163 | mean_times_v1.append(np.mean(time_list))
164 | # consistency check
165 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)
166 | ) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
167 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)
168 | ) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
169 | # performance check
170 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)
171 | ) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
172 | print(
173 | f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
174 | print(
175 | f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
176 | print(
177 | f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
178 |
179 | assert len(tc2hyperfine_v0) == 2
180 | assert len(tc2hyperfine_v1) == 2
181 |
182 |
183 | def test_single_submission(self, get_pie_env):
184 | env = get_pie_env
185 | tc2hyperfine = defaultdict(list)
186 | for _ in range(2):
187 | result = env.submit_single_submission(code=example_1_code,
188 | testcases=[0,1],
189 | problem_id=example_1_problem_id,
190 | timing_env="both")
191 |
192 | assert result.compilation == True
193 | assert result.tc2success[0] == True
194 | assert result.tc2success[1] == True
195 | assert result.tc2time[0] == 0.001035073468
196 | assert result.tc2time[1] == 0.001039205596
197 | assert result.mean_acc > 0.95
198 |
199 | hyperfine_result = result.tc2stats_binary
200 |
201 | for tc, results in hyperfine_result.items():
202 | tc2hyperfine[tc].append(np.array(results))
203 |
204 | for tc, times in tc2hyperfine.items():
205 | mean_times = []
206 | for time_list in times:
207 | mean_times.append(np.mean(time_list))
208 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
209 | assert len(tc2hyperfine) == 2
210 |
211 |
212 |
213 | def test_dual_submission_diff_code(self, get_pie_env):
214 | env = get_pie_env
215 | tc2hyperfine_v0 = defaultdict(list)
216 | tc2hyperfine_v1 = defaultdict(list)
217 | for _ in range(2):
218 | result = env.submit_single_submission_pair(code_v0=example_1_code,
219 | code_v1=example_2_code,
220 | testcases=[0,1],
221 | problem_id=example_1_problem_id,
222 | timing_env="both")
223 |
224 |
225 | assert result.compilation_v0 == True
226 | assert result.compilation_v1 == True
227 |
228 | assert result.mean_acc_v0 > 0.95
229 | assert result.mean_acc_v1 > 0.95
230 |
231 | pprint(result.tc2time_v0)
232 | pprint(result.tc2time_v1)
233 |
234 | print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
235 | print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
236 | print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
237 | print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
238 |
239 | assert result.tc2time_v0[0] == 0.001035073468
240 | assert result.tc2time_v0[1] == 0.001039205596
241 |
242 | assert result.tc2time_v1[0] == 0.001026564396
243 | assert result.tc2time_v1[1] == 0.001029346032
244 |
245 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
246 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
247 |
248 | for tc, time in hyperfine_v0_tc2stats.items():
249 | tc2hyperfine_v0[tc].append(np.array(time))
250 | for tc, time in hyperfine_v1_tc2stats.items():
251 | tc2hyperfine_v1[tc].append(np.array(time))
252 |
253 | for tc, times_v0 in tc2hyperfine_v0.items():
254 | mean_times_v0 = []
255 | for time_list in times_v0 :
256 | mean_times_v0.append(np.mean(time_list))
257 | mean_times_v1 = []
258 | for time_list in tc2hyperfine_v1[tc] :
259 | mean_times_v1.append(np.mean(time_list))
260 | # consistency check
261 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
262 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
263 | # performance check
264 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
265 | print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
266 | print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
267 | print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
268 |
269 | assert len(tc2hyperfine_v0) == 2
270 | assert len(tc2hyperfine_v1) == 2
271 |
272 |
273 | def test_dual_submission_same_code(self, get_pie_env):
274 | env = get_pie_env
275 | tc2hyperfine_v0 = defaultdict(list)
276 | tc2hyperfine_v1 = defaultdict(list)
277 | for _ in range(2):
278 | result = env.submit_single_submission_pair(code_v0=example_1_code,
279 | code_v1=example_1_code,
280 | testcases=[0,1],
281 | problem_id=example_1_problem_id,
282 | timing_env="both")
283 |
284 |
285 | assert result.compilation_v0 == True
286 | assert result.compilation_v1 == True
287 |
288 | assert result.mean_acc_v0 > 0.95
289 | assert result.mean_acc_v1 > 0.95
290 |
291 | pprint(result.tc2time_v0)
292 | pprint(result.tc2time_v1)
293 |
294 | print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
295 | print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
296 | print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001035073468")
297 | print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001039205596")
298 |
299 | assert result.tc2time_v0[0] == 0.001035073468
300 | assert result.tc2time_v0[1] == 0.001039205596
301 |
302 | assert result.tc2time_v1[0] == 0.001035073468
303 | assert result.tc2time_v1[1] == 0.001039205596
304 |
305 | hyperfine_v0_tc2stats = result.tc2stats_binary_v0
306 | hyperfine_v1_tc2stats = result.tc2stats_binary_v1
307 |
308 | for tc, time in hyperfine_v0_tc2stats.items():
309 | tc2hyperfine_v0[tc].append(np.array(time))
310 | for tc, time in hyperfine_v1_tc2stats.items():
311 | tc2hyperfine_v1[tc].append(np.array(time))
312 |
313 | for tc, times_v0 in tc2hyperfine_v0.items():
314 | times_v1 = tc2hyperfine_v1[tc]
315 | mean_times = []
316 | for time_list in times_v0 + times_v1:
317 | mean_times.append(np.mean(time_list))
318 | assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
319 | print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
320 | assert len(tc2hyperfine_v0) == 2
321 |
322 |
323 |
324 | def test_multiple_single_submissions(self, get_pie_env):
325 |
326 |
327 | code_list = [example_1_code, example_2_code] * 3
328 | testcases_list = [[0, 1], [0, 1]] * 3
329 | problem_id_list = [example_1_problem_id, example_2_problem_id] * 3
330 | override_flags_list = ["", ""] * 3
331 |
332 | env = get_pie_env
333 |
334 | results = env.submit_multiple_single_submissions(code_list=code_list,
335 | testcases_list=testcases_list,
336 | problem_id_list=problem_id_list,
337 | override_flags_list=override_flags_list,
338 | timing_env="both")
339 |
340 | tc2hyperfine_v0 = defaultdict(list)
341 | tc2hyperfine_v1 = defaultdict(list)
342 |
343 | for i, result in enumerate(results):
344 | assert result.compilation == True
345 | assert result.tc2success[0] == True
346 | assert result.tc2success[1] == True
347 |
348 | hyperfine_result = result.tc2stats_binary
349 |
350 | if (i % 2) == 0:
351 | assert result.tc2time[0] == 0.001035073468
352 | assert result.tc2time[1] == 0.001039205596
353 | tc2hyperfine = tc2hyperfine_v0
354 | else:
355 | assert result.tc2time[0] == 0.001026564396
356 | assert result.tc2time[1] == 0.001029346032
357 | tc2hyperfine = tc2hyperfine_v1
358 |
359 |             for tc, tc_times in hyperfine_result.items():
360 |                 tc2hyperfine[tc].append(np.array(tc_times))
361 |
362 | for tc, times_v0 in tc2hyperfine_v0.items():
363 | mean_times_v0 = []
364 | for time_list in times_v0 :
365 | mean_times_v0.append(np.mean(time_list))
366 | mean_times_v1 = []
367 | for time_list in tc2hyperfine_v1[tc] :
368 | mean_times_v1.append(np.mean(time_list))
369 |
370 | print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
371 | print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
372 | print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
373 |
374 | # consistency check
375 | assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
376 | assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
377 | # performance check
378 | assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
379 |
380 |
381 | assert len(tc2hyperfine_v0) == 2
382 | assert len(tc2hyperfine_v1) == 2
--------------------------------------------------------------------------------
/gem5/template_config.yaml:
--------------------------------------------------------------------------------
1 | model_generated_outputs_path: "PATH_TO_YOUR_OUTPUTS"
2 | reference_file_path: "PATH_TO_REFERENCE_FILE_JSONL" # The path to the reference file. This should be the reference .jsonl file containing the reference outputs in addition to all other metadata in the test set file.
3 | output_dir: "PATH_TO_DIRECTORY_WHERE_YOU_WANT_TO_SAVE_EVALUATION_RESULTS"
4 | is_prompt_based: False # should always be False
5 | cpus_available: -1
6 | model_generated_potentially_faster_code_col: "generated_answers" # column in the model-generated outputs that contains the generated code; it should be a list of strings
7 | num_problems_to_evaluate: -1 # -1 means evaluate all problems
8 |
--------------------------------------------------------------------------------
/openai_finetuning/README.md:
--------------------------------------------------------------------------------
1 | The script `finetune_openai.py` was used to fine-tune GPT-3.5 Turbo. Its usage is as follows:
2 |
3 | ```bash
4 | python finetune_openai.py PATH_TO_CONFIG.yaml
5 | ```
6 |
7 | We've included a sample config file `openai_config.yaml` in this directory. The config file should contain the following fields:
8 |
9 | ```yaml
10 | api_key: "YOUR_OPENAI_API_KEY"
11 | organization: "YOUR_OPENAI_ORGANIZATION (optional)"
12 | input_train_path: "PATH_TO_TRAINING_DATA"
13 | input_test_path: "PATH_TO_VALIDATION_DATA"
14 | max_train: -1
15 | max_val: -1
16 | max_len: -1
17 | epochs: NUMBER_OF_EPOCHS (we used 1)
18 | output_dir: "PATH_TO_OUTPUT_DIR"
19 | model_suffix: "SUFFIX_FOR_MODEL_NAME"
20 | ```
21 |
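22 | For reference, `finetune_openai.py` converts each training pair into an OpenAI chat-format record before uploading it (see `prepare_dataset` in the script). A minimal sketch of a single record is shown below; the program strings are placeholders rather than real dataset entries:
23 | 
24 | ```python
25 | import json
26 | 
27 | # One record per line of the train.jsonl / test.jsonl files written by the script.
28 | record = {
29 |     "messages": [
30 |         {"role": "system", "content": "You are a helpful assistant that can optimize code."},
31 |         {"role": "user", "content": "// slower version::\n\n<slow program>\n\n// optimized version of the same code:\n\n"},
32 |         {"role": "assistant", "content": "<optimized program>"},
33 |     ]
34 | }
35 | print(json.dumps(record))
36 | ```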
--------------------------------------------------------------------------------
/openai_finetuning/finetune_openai.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | import sys
4 | sys.path.append(os.path.dirname(os.path.realpath(__file__)))
5 | import pie_chatgpt
6 | import re
7 | import json
8 | from typing import List, Dict
9 | import yaml
10 | import logging
11 | import shutil
12 | import uuid
13 | import time
14 | import json
15 | import os
16 | from time import sleep
17 | from io import StringIO
18 | import openai
19 |
20 |
21 |
22 | def load_data(train_path, test_path, max_train, max_val):
23 | df_train = pd.read_json(train_path, lines=True, orient='records')
24 | df_train = df_train.sample(frac=1).reset_index(drop=True)
25 | df_train = df_train[:max_train]
26 | df_test = pd.read_json(test_path, lines=True, orient='records')
27 | df_test = df_test.sample(frac=1).reset_index(drop=True)
28 | df_test = df_test[:max_val]
29 | return df_train, df_test
30 |
31 |
32 |
33 | def prepare_output(code_str, max_len=-1, tokenizer=None):
34 | # "\n+" -> "\n"
35 | if max_len > 0 and tokenizer:
36 | code_str = code_str[:max_len]
37 | elif max_len > 0 and not tokenizer:
38 | raise ValueError("max_len > 0 but no tokenizer provided")
39 | return code_str
40 |
41 |
42 | def prepare_dataset(df, src_code_col, tgt_code_col, max_len=-1, tokenizer=None, max_examples=-1):
43 | df = df.copy()
44 | if max_examples > 0:
45 | df = df.sample(frac=1).reset_index(drop=True)
46 | df = df[:max_examples]
47 | training_examples = []
48 | for i, row in df.iterrows():
49 | src_code = row[src_code_col]
50 | src_code_formatted = pie_chatgpt.ChatGPTWrapper.prepare_input(src_code)
51 | tgt_code = row[tgt_code_col]
52 | tgt_code_formatted = prepare_output(tgt_code, max_len=max_len, tokenizer=tokenizer)
53 |
54 | d = [
55 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
56 | {"role": "user", "content": src_code_formatted},
57 | {"role": "assistant", "content": tgt_code_formatted},
58 | ]
59 | training_examples.append({"messages": d})
60 | return training_examples
61 |
62 |
63 |
64 | def save_dataset(training_examples: List[Dict], file_name: str):
65 | with open(file_name, 'w') as jsonl_file:
66 | for example in training_examples:
67 | jsonl_file.write(json.dumps(example) + '\n')
68 |
69 |
70 | def register_file_openai(file_path, outpath, sleep_interval=30):
71 | logger.info(f"Registering file {file_path} to OpenAI")
72 | file_dict = openai.File.create(
73 | file=open(file_path, "rb"),
74 | purpose='fine-tune',
75 | )
76 | logger.info(f"File registered with id {file_dict['id']}")
77 | while file_dict['status'] != 'processed':
78 | file_dict = openai.File.retrieve(file_dict['id'])
79 | logger.info(f"File status: {file_dict['status']}")
80 | with open(outpath, 'w') as json_file:
81 | json.dump(file_dict, json_file)
82 | if file_dict['status'] != 'processed':
83 | logger.info(f"Sleeping for {sleep_interval} seconds")
84 | sleep(sleep_interval)
85 | return file_dict
86 |
87 |
88 | def main(input_train_path, input_test_path, max_train, max_val, max_len, tokenizer, output_dir, model_suffix="pie_opt", epochs=1):
89 | logging.info(f"Input train path: {input_train_path}; epochs: {epochs}")
90 | if not os.path.exists(output_dir):
91 | os.makedirs(output_dir)
92 | df_train, df_test = load_data(input_train_path, input_test_path, max_train, max_val)
93 | logger.info(f"Loaded {len(df_train)} training examples and {len(df_test)} test examples")
94 | training_examples = prepare_dataset(df_train, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
95 | if os.path.exists(os.path.join(output_dir, "train.jsonl")):
96 | unique_id = uuid.uuid4()
97 | logger.warning(f"File {os.path.join(output_dir, 'train.jsonl')} already exists, copying to {os.path.join(output_dir, f'train_{unique_id}.jsonl')}")
98 | shutil.copy(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, f"train_{unique_id}.jsonl"))
99 | save_dataset(training_examples, os.path.join(output_dir, "train.jsonl"))
100 | training_examples = prepare_dataset(df_test, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
101 | if os.path.exists(os.path.join(output_dir, "test.jsonl")):
102 | unique_id = uuid.uuid4()
103 | logger.warning(f"File {os.path.join(output_dir, 'test.jsonl')} already exists, copying to {os.path.join(output_dir, f'test_{unique_id}.jsonl')}")
104 | shutil.copy(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, f"test_{unique_id}.jsonl"))
105 | save_dataset(training_examples, os.path.join(output_dir, "test.jsonl"))
106 | train_data = register_file_openai(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, "openai_train_file.json"))
107 | val_data = register_file_openai(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, "openai_val_file.json"))
108 | train_data, val_data = wait_on_data(train_data, val_data)
109 | assert train_data['status'] == 'processed'
110 | assert val_data['status'] == 'processed'
111 | with open(os.path.join(output_dir, "openai_train_file.json"), 'w') as train_json_file, open(os.path.join(output_dir, "openai_val_file.json"), 'w') as val_json_file:
112 | json.dump(train_data, train_json_file)
113 | json.dump(val_data, val_json_file)
114 |
115 | model = openai.FineTuningJob.create(
116 | model = "gpt-3.5-turbo",
117 | training_file = train_data['id'],
118 | validation_file = val_data['id'],
119 | suffix = model_suffix,
120 | hyperparameters = {"n_epochs": epochs}
121 | )
122 | logging.info(f"Model {model['id']} created")
123 | logging.info(f"Model dict: {model}")
124 | monitor_model(model, output_dir)
125 | return model
126 |
127 | def wait_on_data(train_data, val_data, max_timeout = 600, sleep_interval=10):
128 | start = time.time()
129 | while train_data['status'] != 'processed' or val_data['status'] != 'processed':
130 | train_data = openai.File.retrieve(train_data['id'])
131 | val_data = openai.File.retrieve(val_data['id'])
132 | logger.info(f"Train data status: {train_data['status']} status_details: {train_data['status_details']}")
133 | logger.info(f"Val data status: {val_data['status']}, status_details: {val_data['status_details']}")
134 | if time.time() - start > max_timeout:
135 | raise TimeoutError("Timeout waiting for data")
136 | logger.info(f"Sleeping for {sleep_interval} seconds")
137 | sleep(sleep_interval)
138 | return train_data, val_data
139 |
140 |
141 | def get_step_metrics(file_id):
142 | content = openai.File.download(file_id)
143 | eval_result = StringIO(content.decode())
144 | df = pd.read_csv(eval_result, sep=",")
145 | return df
146 |
147 |
148 | def handle_get_step_metrics(file_id, output_dir):
149 | content = openai.File.download(file_id)
150 | eval_result = StringIO(content.decode())
151 | try:
152 | df = pd.read_csv(eval_result, sep=",")
153 | df.to_csv(os.path.join(output_dir, f"success_{file_id}.csv"), index=False)
154 | return df
155 | except Exception as e:
156 | error_message = f"Error reading file {file_id}: {e}\n"
157 | file_content_message = f"File content: {content}\n"
158 | file_content_decoded_message = f"File content decoded: {content.decode()}\n"
159 | eval_result_content_message = f"Eval result content: {eval_result.getvalue()}\n"
160 |
161 | with open(os.path.join(output_dir, f"error_{file_id}.txt"), 'w') as error_file:
162 | error_file.write(error_message)
163 | error_file.write(file_content_message)
164 | error_file.write(file_content_decoded_message)
165 | error_file.write(eval_result_content_message)
166 |
167 | logger.error(error_message)
168 | logger.error(file_content_message)
169 | logger.error(file_content_decoded_message)
170 | logger.error(eval_result_content_message)
171 |
172 | return None
173 |
174 | SAMPLE_CPP_PROGRAM_TO_OPTIMIZE = """
175 | #include <stdio.h>
176 | #include <stdlib.h>
177 | #include <string.h>
178 | #include <math.h>
179 |
180 | int main(int argc, char** argv) {
181 | int n = 1000000;
182 | int* a = (int*) malloc(n * sizeof(int));
183 | int* b = (int*) malloc(n * sizeof(int));
184 | int* c = (int*) malloc(n * sizeof(int));
185 | for (int i = 0; i < n; i++) {
186 | a[i] = i;
187 | b[i] = i;
188 | }
189 | for (int i = 0; i < n; i++) {
190 | c[i] = a[i] + b[i];
191 | }
192 | printf("%d", c[0]);
193 | free(a);
194 | free(b);
195 | free(c);
196 | return 0;
197 | }
198 | """
199 |
200 |
201 |
202 |
203 | def monitor_model(model_dict, output_dir, sleep_interval=30):
204 | model = openai.FineTuningJob.retrieve(model_dict['id'])
205 | logger.info(f"Model status: {model['status']}")
206 | while model['status'] != 'succeeded':
207 | model = openai.FineTuningJob.retrieve(model_dict['id'])
208 | logger.info(f"Model status: {model['status']}")
209 | if model['status'] != 'succeeded':
210 | logger.info(f"Sleeping for {sleep_interval} seconds")
211 | if "result_files" in model:
212 | for file_id in model['result_files']:
213 | if file_id is not None:
214 | result = openai.File.download(file_id)
215 | with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file:
216 | result_file.write(result)
217 | logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.json')}")
218 | try:
219 | df = pd.read_csv(os.path.join(output_dir, f"result_{file_id}.csv"))
220 | last_row = df.iloc[-1]
221 | logger.info(f"Last row: {last_row}")
222 | except Exception as e:
223 | logger.error(f"Error reading file {file_id}: {e}")
224 | logger.error(f"File content: {result}")
225 | logger.error(f"File content decoded: {result.decode()}")
226 |
227 | with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
228 | json.dump(model, json_file)
229 | sleep(sleep_interval)
230 |
231 | if "result_files" in model:
232 | for file_id in model['result_files']:
233 | if file_id is not None:
234 | result = openai.File.download(file_id)
235 | with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file: # 'wb'
236 | result_file.write(result)
237 | logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.json')}")
238 |
239 | with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
240 | json.dump(model, json_file)
241 |
242 | # parse the clock time
243 | # finished_at = model['finished_at']
244 | # started_at = model['started_at']
245 | # total_time = finished_at - started_at
246 | finished_at = model.get('finished_at', None)
247 | started_at = model.get('started_at', None)
248 | if finished_at is not None and started_at is not None:
249 | total_time = finished_at - started_at
250 | logging.info(f"Model {model['id']} finished in {total_time / 60} minutes")
251 | if "trained_tokens" in model:
252 | logging.info(f"Model {model['id']} trained tokens: {model['trained_tokens']}")
253 |
254 | logging.info(f"Model {model['id']} fine-tuned model: {model['fine_tuned_model']}")
255 |
256 |
257 | chat_log = [
258 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
259 | {"role": "user", "content": pie_chatgpt.ChatGPTWrapper.prepare_input(SAMPLE_CPP_PROGRAM_TO_OPTIMIZE)},
260 | ]
261 |
262 | try:
263 | response = openai.ChatCompletion.create(
264 | model=model['fine_tuned_model'],
265 | messages=chat_log,
266 | max_tokens=1000,
267 | temperature=0.0,
268 | )
269 | logging.info(f"************************")
270 | logging.info(f"Input program: {SAMPLE_CPP_PROGRAM_TO_OPTIMIZE}")
271 | logging.info("************************")
272 | logging.info(f"Output program: {response['choices'][0]['message']['content']}")
273 | except Exception as e:
274 | logging.error(f"Error calling OpenAI API: {e}")
275 | logging.error(f"Chat log: {chat_log}")
276 |
277 | return model
278 |
279 |
280 | def load_config(yaml_path):
281 | with open(yaml_path, 'r') as file:
282 | config = yaml.safe_load(file)
283 | return config
284 |
285 |
286 |
287 | if __name__ == "__main__":
288 | import transformers
289 | tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
290 |
291 | if len(sys.argv) > 1:
292 | config_path = sys.argv[1]
293 | else:
294 | raise ValueError("No config path provided")
295 | config = load_config(config_path)
296 |
297 | openai.api_key = config['api_key']
298 | if 'organization' in config and config['organization']:
299 | openai.organization = config['organization']
300 |
301 | assert len(config['model_suffix']) > 0 and len(config['model_suffix']) < 19, "model_suffix must be between 1 and 18 characters"
302 |
303 | logger = logging.getLogger(__name__)
304 | ## log date and time
305 | if not os.path.exists(config['output_dir']):
306 | os.makedirs(config['output_dir'])
307 | logging.basicConfig(
308 | level=logging.INFO,
309 | format='%(asctime)s %(message)s',
310 | handlers=[
311 | logging.FileHandler(os.path.join(config['output_dir'], 'chatgpt_fine_tuning.log')),
312 | logging.StreamHandler()
313 | ]
314 | )
315 |
316 | logging.info(f"Config: {config}")
317 |
318 | main(
319 | input_train_path=config['input_train_path'],
320 | input_test_path=config['input_test_path'],
321 | max_train=config['max_train'],
322 | max_val=config['max_val'],
323 | max_len=config['max_len'],
324 | tokenizer=tokenizer,
325 | output_dir=config['output_dir'],
326 | model_suffix=config['model_suffix'],
327 | epochs=config['epochs']
328 | )
329 |
330 |
--------------------------------------------------------------------------------
/openai_finetuning/openai_config.yaml:
--------------------------------------------------------------------------------
1 | api_key: ""
2 | organization: ""
3 | input_train_path: ""
4 | input_test_path: ""
5 | max_train: -1
6 | max_val: -1
7 | max_len: -1
8 | epochs: 1
9 | output_dir: ""
10 | model_suffix: ""
--------------------------------------------------------------------------------
/openai_finetuning/pie_chatgpt.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import openai
3 | import random, time
4 | import tiktoken
5 | from tqdm import tqdm
6 | from typing import List
7 | from concurrent.futures import ThreadPoolExecutor
8 |
9 | random.seed(42)
10 |
11 |
12 | def retry_with_exponential_backoff(
13 | func,
14 | initial_delay: float = 1,
15 | exponential_base: float = 2,
16 | jitter: bool = True,
17 | max_retries: int = 10,
18 | errors: tuple = (
19 | openai.error.RateLimitError,
20 | openai.error.ServiceUnavailableError,
21 | ),
22 | ):
23 | """Retry a function with exponential backoff."""
24 |
25 | def wrapper(*args, **kwargs):
26 | # Initialize variables
27 | num_retries = 0
28 | delay = initial_delay
29 |
30 | # Loop until a successful response or max_retries is hit or an exception is raised
31 | while True:
32 | try:
33 | return func(*args, **kwargs)
34 |
35 | # Retry on specified errors
36 | except errors as e:
37 | # Increment retries
38 | num_retries += 1
39 |
40 | # Check if max retries has been reached
41 | if num_retries > max_retries:
42 | raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
43 |
44 | # Increment the delay
45 | delay *= exponential_base * (1 + jitter * random.random())
46 |
47 | # Sleep for the delay
48 | time.sleep(delay)
49 | print(f"\nRetrying after {delay:.2f} seconds.")
50 |
51 | # Raise exceptions for any errors not specified
52 | except Exception as e:
53 | raise e
54 |
55 | return wrapper
56 |
57 |
58 | class ChatGPTWrapper:
59 | """A Wrapper for ChatGPT model interaction."""
60 |
61 | @staticmethod
62 | def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
63 | """
64 | Calculate the number of tokens in a text string.
65 |
66 | Args:
67 | - string (str): The text string to be tokenized.
68 | - encoding_name (str, optional): The encoding name for tokenization. Defaults to "cl100k_base".
69 | Returns:
70 | - int: Number of tokens in the string.
71 | """
72 | encoding = tiktoken.get_encoding(encoding_name)
73 | num_tokens = len(encoding.encode(string))
74 | return num_tokens
75 |
76 | @staticmethod
77 | @retry_with_exponential_backoff
78 | def call_openai_api(
79 | slow_code_str: str, max_tokens: int = 1024, temperature: float = 0.0
80 | ) -> str:
81 | """
82 | Calls the OpenAI API to optimize a given code.
83 |
84 | Args:
85 | - slow_code_str (str): The code string that needs to be optimized.
86 |
87 | - max_tokens (int, optional): The maximum number of tokens to be used for generation. Defaults to 1024.
88 |
89 | - temperature (float, optional): The temperature value for generation. Defaults to 0.0.
90 |
91 | Returns:
92 | - str: Optimized code returned by the OpenAI API.
93 | """
94 | # Initialize the chat log with system and user inputs
95 | start_chat_log = [
96 | {"role": "system", "content": "You are a helpful assistant that can optimize code."},
97 | {"role": "user", "content": ChatGPTWrapper.prepare_input(slow_code_str)},
98 | ]
99 | # Call the OpenAI API with the given chat log
100 | response = openai.ChatCompletion.create(
101 | model="gpt-3.5-turbo-0613",
102 | messages=start_chat_log,
103 | max_tokens=max_tokens,
104 | temperature=temperature,
105 | )
106 | # Extract the optimized code from the response
107 | return response["choices"][0]["message"]["content"]
108 |
109 | @staticmethod
110 | def prepare_input(slow_code_str: str) -> str:
111 | """
112 | Prepares the input for the OpenAI API by framing the code to be optimized.
113 |
114 | Args:
115 | - slow_code_str (str): The code string that needs to be framed for optimization.
116 |
117 | Returns:
118 | - str: Formatted input for the OpenAI API.
119 | """
120 | prompt = f"""// slower version::
121 |
122 | {slow_code_str}
123 |
124 | // optimized version of the same code:
125 |
126 | """
127 | return prompt
128 |
129 |
130 | QUESTION_PREFIX = "# slower version:\n\n"
131 | ANSWER_PREFIX = "# optimized version of the same code:\n\n"
132 |
133 |
134 |
135 | def main(input_file: str, output_file: str):
136 | # Read the jsonl file using pandas
137 | df = pd.read_json(input_file, lines=True)
138 |
139 | # Ensure src_code is in the dataframe
140 | if 'src_code' not in df.columns:
141 | raise ValueError("'src_code' column not found in the input file.")
142 |
143 | # Optimize code using multiple threads
144 | df['optimized_code'] = optimize_code_parallel(df['src_code'].tolist())
145 |
146 | # Save the dataframe to a new jsonl file
147 | df.to_json(output_file, orient='records', lines=True)
148 |
149 |
150 | def optimize_code_parallel(code_list: List[str], max_workers: int = 5) -> List[str]:
151 | """
152 | Function to optimize code using multiple threads.
153 |
154 | Args:
155 | - code_list (List[str]): List of code strings to optimize.
156 | - max_workers (int): Number of worker threads.
157 |
158 | Returns:
159 | - List[str]: List of optimized code strings.
160 | """
161 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
162 | optimized_code_list = list(tqdm(executor.map(ChatGPTWrapper.call_openai_api, code_list), total=len(code_list)))
163 | return optimized_code_list
164 |
165 | if __name__ == "__main__":
166 | import sys
167 |
168 | if len(sys.argv) != 3:
169 | print("Usage: python pie_chatgpt.py ")
170 | sys.exit(1)
171 | main(input_file=sys.argv[1], output_file=sys.argv[2])
--------------------------------------------------------------------------------
/retrieval/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Retrieval
2 |
3 | The notebook `retrieval.ipynb` can be used to prepare the retrieval dataset. Given a training dataset and the test-set examples to optimize, it retrieves the K most similar training example pairs for each test example; the retrieved pairs are then used to prompt the model for optimized outputs. A minimal sketch of one possible retrieval approach is shown below.
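4 | 
5 | The notebook's exact retrieval setup is not reproduced here, but the following sketch illustrates one way to retrieve the K nearest training pairs; the TF-IDF representation and the file/column names (`train.jsonl`, `test.jsonl`, `src_code`, `tgt_code`) are assumptions for illustration, not the notebook's fixed choices:
6 | 
7 | ```python
8 | import pandas as pd
9 | from sklearn.feature_extraction.text import TfidfVectorizer
10 | from sklearn.metrics.pairwise import linear_kernel
11 | 
12 | K = 4
13 | train = pd.read_json("train.jsonl", lines=True)  # (slow, fast) training pairs
14 | test = pd.read_json("test.jsonl", lines=True)    # programs to optimize
15 | 
16 | # Represent programs with character n-gram TF-IDF and compare by cosine similarity
17 | # (TF-IDF vectors are L2-normalized, so the linear kernel equals cosine similarity).
18 | vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
19 | train_vecs = vectorizer.fit_transform(train["src_code"])
20 | test_vecs = vectorizer.transform(test["src_code"])
21 | 
22 | sims = linear_kernel(test_vecs, train_vecs)   # (n_test, n_train) similarity matrix
23 | topk = sims.argsort(axis=1)[:, ::-1][:, :K]   # indices of the K most similar training pairs
24 | 
25 | # The retrieved (slow, fast) pairs become the in-context examples in the prompt.
26 | retrieved = [train.iloc[idx][["src_code", "tgt_code"]].to_dict("records") for idx in topk]
27 | ```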
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | The file `sample_and_eval.py` shows how to chain together sampling from a Hugging Face model and then evaluating those generations. It takes a YAML config in the form of `template_config.yaml`. Use it with caution: during our work we ran each stage separately, so this script is mainly illustrative and may contain errors, since we never ran it end-to-end in this form.
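2 | 
3 | Assuming the repository layout shown here, the script takes the config path as its only argument (e.g. `python scripts/sample_and_eval.py scripts/template_config.yaml`): it starts a text-generation-inference container, samples completions with `finetuning/sample.py`, and then runs `gem5/gem5_eval.py` on those samples.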
--------------------------------------------------------------------------------
/scripts/sample_and_eval.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 | import logging
4 | import sys
5 | import yaml
6 | import shutil
7 | import os
8 |
9 | def start_generation_container(model, volume, max_best_of, port=4242, startup_timeout=600):
10 | # command = f"docker run --detach --gpus all --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
11 | # with 1,2,3,4,5,6,7 gpus
12 | if not model.startswith("codellama"):
13 | model = f"data/{model}"
14 | # the first command may be
15 | command = f"docker run --detach --gpus 1,2,3,4,5,6,7 --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
16 | # use the following line for podman or potentially for a different docker installation, the nvidia-docker command may vary
17 | # command = f"docker run --detach -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
18 | container_id = subprocess.check_output(command, shell=True).decode().strip()
19 | # wait until the logs say Connected
20 | while True:
21 | logging.info(f"Waiting for container to start with id {container_id} and timeout {startup_timeout} left")
22 | logs = subprocess.check_output(f"docker logs {container_id}", shell=True).decode()
23 | if "Connected" in logs:
24 | break
25 | time.sleep(5)
26 | startup_timeout -= 5
27 | if startup_timeout <= 0:
28 | raise TimeoutError("Timeout waiting for container to start")
29 | return container_id
30 |
31 | def stop_generation_container(container_id):
32 | subprocess.run(f"docker stop {container_id}", shell=True)
33 |
34 | def remove_generation_container(container_id):
35 | subprocess.run(f"docker rm {container_id}", shell=True)
36 |
37 |
38 | def sample_from_container(test_file, output_file, do_sample, num_samples=8, max_new_tokens=1000, temperature=0.7, num_threads=20, prompt_name="code_opt"):
39 | logging.info(f"Sampling from container with test_file {test_file} and output_file {output_file}")
40 | command = f"python finetuning/sample.py --test_file {test_file} --output_file {output_file} --do_sample {do_sample} --num_samples {num_samples} --max_new_tokens {max_new_tokens} --temperature {temperature} --num_threads {num_threads} --prompt_name {prompt_name}"
41 | logging.info(f"Running command {command}")
42 | p = subprocess.run(command, shell=True)
43 | logging.info(f"sample.py returned with code {p.returncode}")
44 | return p.returncode
45 |
46 | def run_eval(eval_args, sampling_args):
47 | eval_args["model_generated_outputs_path"] = sampling_args["output_file"]
48 | eval_output_dir = eval_args["output_dir"]
49 | if not os.path.exists(eval_output_dir):
50 | os.makedirs(eval_output_dir)
51 | else:
52 | logging.warning(f"Output directory {eval_output_dir} already exists, overwriting")
53 | with open(os.path.join(eval_output_dir, "config.yaml"), "w") as f:
54 | yaml.dump(eval_args, f)
55 | logging.info(f"Running eval with args {eval_args}")
56 | cmd = f"python gem5/gem5_eval.py --config_path {os.path.join(eval_output_dir, 'config.yaml')}"
57 | logging.info(f"Running command {cmd}")
58 | p = subprocess.run(cmd, shell=True)
59 | logging.info(f"gem5_eval.py returned with code {p.returncode}")
60 | logging.info("Done")
61 |
62 |
63 | def main():
64 | cfg_path = sys.argv[1]
65 | with open(cfg_path, 'r') as f:
66 | cfg = yaml.safe_load(f)
67 | text_gen_args = cfg["text_gen_args"]
68 | sampling_args = cfg["sampling_args"]
69 | eval_args = cfg["eval_args"]
70 |
71 | # Check if the output directory for evaluation exists
72 | if os.path.exists(eval_args['output_dir']):
73 | logging.info(f"Output directory {eval_args['output_dir']} already exists. Skipping the entire script.")
74 | return
75 |
76 | # Check if the output file from sampling exists
77 | if os.path.exists(sampling_args['output_file']):
78 | logging.info(f"Output file {sampling_args['output_file']} from sampling already exists. Skipping container startup and sampling.")
79 | else:
80 | # Start the container and perform sampling
81 | logging.info(f"Starting generation container with args {text_gen_args}")
82 | container_id = start_generation_container(text_gen_args["generation_model_name"], text_gen_args["volume_mount"], text_gen_args["max_best_of"], port=text_gen_args["port"])
83 | logging.info(f"Sampling from container with args {sampling_args}")
84 | sample_from_container(**sampling_args)
85 | # Stop and remove the container
86 | logging.info(f"Stopping container with id {container_id}")
87 | stop_generation_container(container_id)
88 | logging.info(f"Removing container with id {container_id}")
89 | remove_generation_container(container_id)
90 | logging.info("Successfully removed container")
91 |
92 | # Run evaluation
93 | logging.info(f"Setting model_generated_outputs_path to {sampling_args['output_file']} and running eval with args {eval_args}")
94 | run_eval(eval_args, sampling_args)
95 |
96 |
97 | if __name__ == "__main__":
98 | main()
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/scripts/template_config.yaml:
--------------------------------------------------------------------------------
1 | text_gen_args:
2 | generation_model_name: "your_model_name"
3 | volume_mount: "/path/to/your/volume"
4 | max_best_of: 5
5 | port: 4242
6 |
7 | sampling_args:
8 | test_file: "/path/to/your/test_file"
9 | output_file: "/path/to/your/output_file"
10 | do_sample: true
11 | num_samples: 8
12 | max_new_tokens: 1000
13 | temperature: 0.7
14 | num_threads: 20
15 | prompt_name: "code_opt"
16 |
17 | eval_args:
18 | reference_file_path: "/path/to/your/reference_test_set.jsonl"
19 | output_dir: "/path/to/your/evaluation_output_directory"
20 | is_prompt_based: false
21 | cpus_available: -1
22 | model_generated_potentially_faster_code_col: "generated_answers"
23 | num_problems_to_evaluate: -1
24 | 
--------------------------------------------------------------------------------