├── LICENSE ├── README.md ├── data_augmentation ├── README.md ├── data_augmentation.py ├── data_augmentation_driver_final.sh └── data_augmentation_driver_search.sh ├── docs ├── PIE_experiments_readme.md └── images │ ├── 38_animation.gif │ ├── 46_animation.gif │ ├── arch.png │ ├── performance_conditioning.png │ └── self_play.drawio.png ├── finetuning ├── .gitignore ├── LICENSE ├── README.md ├── docker │ └── Dockerfile ├── finetune.py ├── sample.py ├── sample.sh ├── server.sh ├── templates │ ├── code_opt.json │ ├── code_opt_w_speedup_bin.json │ ├── code_opt_w_speedup_desc.json │ └── code_opt_w_speedup_pctile.json ├── tokenizer_files │ ├── 13B │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ └── 7B │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json ├── train.sh └── utils │ ├── convert_to_safetensors.py │ └── prompter.py ├── gem5 ├── README.md ├── __init__.py ├── api_pytest.py ├── benchmarking.py ├── gem5_api.py ├── gem5_eval.py ├── pytest_simulator.py ├── simulator.py └── template_config.yaml ├── openai_finetuning ├── README.md ├── finetune_openai.py ├── openai_config.yaml └── pie_chatgpt.py ├── retrieval ├── README.md └── retrieval.ipynb └── scripts ├── README.md ├── sample_and_eval.py └── template_config.yaml /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Performance-Improving Code Edits 2 | 3 | Repository for *Learning Performance-Improving Code Edits* ([paper](https://openreview.net/forum?id=ix7rLVHXyY), [website](https://pie4perf.com/)). 4 | 5 | 🚨 Benchmarking programs is easy; but benchmarking programs in a reproducible and deterministic manner is very hard. 
6 | 7 | 🚨 LLMs are not great at program optimization out-of-the-box (at least for competitive programming problems) 8 | 9 | 10 | We perform extensive experiments to evaluate and improve Large Language Models (LLMs) for program optimization. We built a custom evaluation framework that benchmarks program execution time in a highly reliable manner, and we provide a dataset annotated with execution time information from our environment. 11 | 12 | When measuring average program speedup, we obtained a fine-tuned version of CodeLlama13B that outperforms GPT4 and the best human programmer. Using self-play for program optimization, we also obtain a fine-tuned version of GPT3.5 that is even stronger. 13 | 14 | 15 | 17 | 18 | Description of animation 19 | 20 | 21 | 23 | 24 | 25 | ## Dataset 26 | 27 | - PIE is based on [IBM CodeNet](https://github.com/IBM/Project_CodeNet). Huge thanks to the authors of CodeNet for making their curated dataset available! 28 | 29 | Our Train/Val/Test splits are located [here](https://drive.google.com/drive/folders/1E_yFqM8khN1HAH03OKhjheSlNI4rYTT7?usp=sharing). There is also a `train_with_synthetic.jsonl` file which contains an additional ~1.4K pairs generated via self-play. We also have subsets `train_hq_only.jsonl` and `train_hq_and_synthetic.jsonl`, which contain only high-quality pairs and high-quality pairs + synthetic pairs, respectively. 30 | 31 | Test cases: 32 | 33 | - [Merged test cases](https://drive.google.com/file/d/1evBDJapwRvCQK6VUCTV8ZE9WG2k3QJQr/view?usp=sharing) containing both public and generated test cases: these test cases were the ones used for the experiments in the paper. 34 | - [Public test cases](https://drive.google.com/file/d/1RcUpZMOR8L2xYYWDZx7I0tHFzFgg7COO/view?usp=share_link). These test cases are sourced from IBM CodeNet. 35 | - [Generated test cases](https://drive.google.com/file/d/1migwX4wpED0gDDxn7gS6q55vWeXIDgId/view?usp=drive_link). These test cases are sourced from [alphacode](https://github.com/google-deepmind/code_contests). 36 | 37 | The `tests` column in the jsonl files contains the indices of the test cases that should be used for benchmarking models. 38 | 39 | ## Program Benchmarking with gem5 40 | 41 | Benchmarking programs is easy; but benchmarking programs in a reproducible and deterministic manner is very hard. 42 | This matters because we want to compare the performance of different models on the same set of programs irrespective of 43 | a researcher's server configuration. Moreover, you can even wind up in scenarios where you benchmark the exact same program twice and accidentally believe one run is much faster than the other. 44 | 45 | gem5 46 | 47 | We built a custom evaluation framework that benchmarks program execution time in a highly reliable manner. 48 | The framework is an execution sandbox based on the gem5 simulator. Provided a program terminates (i.e., does not time out), the benchmarking results are deterministic. 49 | For our experiments, we use a simulated Intel Skylake CPU. 50 | We provide an easy-to-use docker image and API that can be used to reproduce our results and that other researchers can continue to use for program optimization research. 51 | 52 | Building the environment is similar to the [gym](https://github.com/Farama-Foundation/Gymnasium) API for reinforcement learning. After importing the module and running make, the docker image is automatically pulled on the first iteration and a container is created. The environment then provides a convenient abstraction for interacting with the simulator.
More information is located at [gem5](./gem5/README.md). 53 | 54 | It is possible that on a different architecture the gem5 simulator runs slower or faster than when we ran it, so results could be influenced by more- or less-frequent timeouts. Generally this should affect programs on the threshold of timing out, 55 | and it should affect more-aggressive optimizations (often "better" models) less than less-aggressive optimizations. 56 | 57 | ```python 58 | import simulator 59 | 60 | # pulls the image from docker hub if it doesn't exist and sets up a connection with a running container 61 | env = simulator.make(arch='X86-skylake', optimization_flag='-O3') 62 | # example of sending a program to benchmark within the environment 63 | gem5_benchmarking_results = env.submit_single_submission(...) 64 | ``` 65 | ## Performance-Conditioning 66 | 67 | Programs can typically be written in many ways with different performance profiles. When training a model to predict performance-improving edits with 68 | a large dataset, it may be trained on a mix of large and small improvements, without any information on which improvements are more desirable than others. We introduce performance tags during training by associating each “fast” program with a tag indicating how close it comes to the optimal achievable performance across all solutions in the dataset. 69 | 70 | perf-conditioning 71 | 72 | Specifically, the tag indicates how close that program is to peak performance on a binned scale 73 | {1, 2, . . . , 10}. At test time, we then prompt the model with a test input and the maximal score tag “10/10”, directing it to generate a highly optimized solution. 74 | 75 | The performance tags are available for the [training dataset](#dataset) and can be used to train models with performance-conditioning. We also provide our fine-tuning code, which adds the prompts during training and inference. 76 | 77 | ## Self-Play 78 | 79 | In an attempt to boost the performance of our models, we also investigate the use of self-play for program optimization as a data augmentation technique. Because there is a limited set of programs in our dataset, we use an off-the-shelf language model to generate new programs and a high-quality fine-tuned model to generate new optimizations. After taking rigorous steps to ensure the generated programs are semantically novel and the optimizations are non-trivial, we use the generated programs and optimizations to create new program optimization pairs. 80 | 81 | 82 | The self-play notion comes from the fact that one model is used to generate the programs to solve and the other model is used to propose the optimizations. 83 | 84 | 85 | 86 | self-play 87 | 88 | Our best model without self-play was a fine-tuned GPT3.5 Turbo; this best fine-tuned model was trained with 4,085 high-quality pairs. We were able to sample 3,314 novel programs and obtain 1,485 high-quality optimizations. 89 | 90 | Using these additional 1,485 optimizations helped improve the performance of our fine-tuned model. We also performed an ablation by adding the 1,485 next-best pairs from the PIE dataset when fine-tuning GPT3.5 Turbo, but these pairs led to performance degradation. 91 | 92 | 93 | We provide our scripts for [sampling programs and detecting semantic duplicates](./data_augmentation/data_augmentation_driver_final.sh) and the [self-play data itself](#dataset). A simplified sketch of this augmentation loop is shown below.
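Below is a minimal, illustrative sketch of the self-play augmentation loop. The function names (`generate_programs`, `generate_optimization`, `measure_runtime`), the novelty check, and the speedup threshold are hypothetical placeholders rather than the actual implementation in `data_augmentation/data_augmentation.py`; the sketch only shows how generated programs could be deduplicated and paired with non-trivial optimizations.

```python
# Illustrative sketch only -- the real pipeline lives in data_augmentation/data_augmentation.py.
# `generate_programs`, `generate_optimization`, and `measure_runtime` are hypothetical stand-ins
# for the generator model, the fine-tuned optimizer, and gem5 benchmarking, respectively.

def normalize(program: str) -> str:
    # Placeholder "duplicate" check: strip whitespace and lowercase.
    # The real pipeline uses stronger criteria to ensure programs are semantically novel.
    return "".join(program.split()).lower()

def self_play_augmentation(seed_programs, generate_programs, generate_optimization,
                           measure_runtime, min_speedup=1.1):
    seen = {normalize(p) for p in seed_programs}
    new_pairs = []
    for candidate in generate_programs(seed_programs):
        if normalize(candidate) in seen:          # skip near-duplicates of known programs
            continue
        seen.add(normalize(candidate))
        optimized = generate_optimization(candidate)
        slow_t, fast_t = measure_runtime(candidate), measure_runtime(optimized)
        if fast_t > 0 and slow_t / fast_t >= min_speedup:   # keep only non-trivial speedups
            new_pairs.append({"src_code": candidate, "tgt_code": optimized})
    return new_pairs
```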
94 | 95 | 96 | # Running Experiments 97 | 98 | ## Finetuning Open Source Models 99 | 100 | We provide a docker image at ```yimengzeng/pie:torch201``` which contains all of the dependencies for finetuning the model; you can also refer to ```docker/Dockerfile``` for the specific packages required to replicate the environment. 101 | 102 | To finetune codellama with the entire PIE dataset and the non-performance-conditioned prompt, run 103 | ```bash 104 | bash finetuning/train.sh 105 | ``` 106 | To finetune codellama with the performance-conditioned prompt, change the ```--prompt_template_name``` flag to ```"code_opt_w_speedup_pctile"```. More details are located in the ```finetuning``` directory. 107 | 108 | ## Finetuning OpenAI Models 109 | 110 | The script `openai_finetuning/finetune_openai.py` was used to finetune GPT3.5 Turbo. Its usage is as follows: 111 | 112 | ```bash 113 | python finetune_openai.py PATH_TO_CONFIG.yaml 114 | ``` 115 | 116 | More details and an example config file are located in the `openai_finetuning` directory. 117 | 118 | ## Dynamic Retrieval 119 | 120 | A notebook that can be used to prepare the retrieval dataset is `retrieval/retrieval.ipynb`. Given a training dataset and the test set examples to optimize, it retrieves the K most similar training example pairs for the given test set examples. The retrieved pairs are then used to prompt the model for optimized outputs. 121 | 122 | ## Sampling from Models 123 | 124 | To generate prompts for the models, please follow the details in the paper. Additional utilities for constructing prompts are located in `finetuning/templates` and in the `finetuning/utils/prompter.py` module. 125 | 126 | Samples from our fine-tuned models are located [here](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing). 127 | 128 | #### Sampling from Open Source Models 129 | To sample optimized programs using the finetuned model with the ```text-generation-inference``` tool, first replace the ```PATH_TO_MODEL``` field with the actual path of the finetuned model in ```server.sh```, and then, to serve the model, run 130 | ```bash 131 | bash finetuning/server.sh 132 | ``` 133 | 134 | To sample from the model just served, with the default parameters used in the paper, run 135 | ```bash 136 | bash finetuning/sample.sh 137 | ``` 138 | 139 | More details are located in the ```finetuning``` directory. 140 | 141 | #### Sampling from OpenAI 142 | 143 | We used [prompt-lib](https://github.com/reasoning-machines/prompt-lib/tree/main) to sample from OpenAI's endpoints. 144 | 145 | ## Self-Play Experiments 146 | 147 | The directory `data_augmentation` contains the scripts used to sample and filter novel competitive programming problems for PIE. 148 | 149 | The script ``data_augmentation/data_augmentation_driver_final.sh`` contains the final parameters we used to sample the problems. More details are located in the `data_augmentation` directory. 150 | 151 | ## Evaluation 152 | 153 | The evaluation driver is located in `gem5/gem5_eval.py`. This script requires a yaml configuration file to be passed in as an argument to `--config_path`. Example usage from the project directory would be: 154 | 155 | ```bash 156 | export PYTHONPATH=$PYTHONPATH:$(pwd) 157 | python gem5/gem5_eval.py --config_path PATH_TO_EXPERIMENT_CONFIG.yaml 158 | ``` 159 | 160 | Results from our experiments can be found in [this google drive folder](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing).
161 | 162 | More details are located in the `gem5` directory. 163 | 164 | ---- 165 | 166 | ## Citation 167 | 168 | ``` 169 | @inproceedings{pie_iclr_2024_spotlight, 170 | title={\href{https://openreview.net/pdf?id=ix7rLVHXyY}{Learning Performance-Improving Code Edits}}, 171 | author={Shypula, Alexander and Madaan, Aman and Zeng, Yimeng and Alon, Uri and Gardner, Jacob and Hashemi, Milad and Neubig, Graham and Ranganathan, Parthasarathy and Bastani, Osbert and Yazdanbakhsh, Amir}, 172 | booktitle={The Twelfth International Conference on Learning Representations (ICLR)}, 173 | year={2024} 174 | } 175 | ``` 176 | 177 | 178 | -------------------------------------------------------------------------------- /data_augmentation/README.md: -------------------------------------------------------------------------------- 1 | # Self-Play / Data Augmentation for PIE 2 | 3 | ## Overview 4 | 5 | This subdirectory contains the scripts used to sample and filter novel competitive programming problems for PIE. `data_augmentation_driver_search.sh` contains the different parameter settings we searched over to find the best values for sampling, and ``data_augmentation_driver_final.sh`` contains the final parameters we used to sample the problems. -------------------------------------------------------------------------------- /data_augmentation/data_augmentation_driver_final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PATH_TO_PIE="" 4 | WORKING_DIR="" 5 | RESULTS_DIR="" 6 | TIMEOUT=5 7 | MAX_TOKENS=16000 8 | MAX_PROMPT_LENGTH=10000 9 | MODEL="gpt-3.5-turbo-16k-0613" 10 | GENERATION_STRATEGY="code_only" 11 | 12 | 13 | ## if constant factor, expected 10K programs for $15-$20 14 | PARAM_SETS=( 15 | "1.0 0.9 5 2000" # Temperature=1.0, Top-p=0.9, Num_Samples=5, Total_Iterations=2000; 10K generations 16 | ) 17 | 18 | # Loop through each set of parameters and run the Python script 19 | for PARAM_SET in "${PARAM_SETS[@]}"; do 20 | # Split the parameter set into individual variables 21 | read -r TEMPERATURE TOP_P NUM_SAMPLES TOTAL_ITERATIONS <<< "$PARAM_SET" 22 | 23 | echo "Running with Temperature=$TEMPERATURE, Top-p=$TOP_P, Num_Samples=$NUM_SAMPLES, Total_Iterations=$TOTAL_ITERATIONS" 24 | 25 | python3 $PATH_TO_PIE/src/data_augmentation/data_augmentation.py \ 26 | --working_dir $WORKING_DIR \ 27 | --results_dir_root $RESULTS_DIR \ 28 | --timeout $TIMEOUT \ 29 | --temperature $TEMPERATURE \ 30 | --top_p $TOP_P \ 31 | --max_tokens $MAX_TOKENS \ 32 | --max_prompt_length $MAX_PROMPT_LENGTH \ 33 | --model $MODEL \ 34 | --generation_strategy $GENERATION_STRATEGY \ 35 | --num_samples $NUM_SAMPLES \ 36 | --total_iterations $TOTAL_ITERATIONS 2>&1 | tee $RESULTS_DIR/temperature_${TEMPERATURE}_top_p_${TOP_P}_num_samples_${NUM_SAMPLES}_total_iterations_${TOTAL_ITERATIONS}.log 37 | done 38 | -------------------------------------------------------------------------------- /data_augmentation/data_augmentation_driver_search.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PATH_TO_PIE="" 4 | WORKING_DIR="" 5 | RESULTS_DIR="" 6 | TIMEOUT=5 7 | MAX_TOKENS=16000 8 | MAX_PROMPT_LENGTH=10000 9 | MODEL="gpt-3.5-turbo-16k-0613" 10 | GENERATION_STRATEGY="code_only" 11 | 12 | PARAM_SETS=( 13 | "1.2 0.8 5 20" 14 | "1 0.8 5 20" 15 | "1 0.9 5 20" 16 | "0.8 1 5 20" 17 | "0.5 1 5 20" 18 | "1 0.8 1 100" 19 | "1 0.9 1 100" 20 | "1 1 1 100" 21 | "0.7 1 1 100" 22 | "0.5 1 1 100" 23 | ) 24 | 25 | 26 | # Loop
through each set of parameters and run the Python script 27 | for PARAM_SET in "${PARAM_SETS[@]}"; do 28 | # Split the parameter set into individual variables 29 | read -r TEMPERATURE TOP_P NUM_SAMPLES TOTAL_ITERATIONS <<< "$PARAM_SET" 30 | 31 | echo "Running with Temperature=$TEMPERATURE, Top-p=$TOP_P, Num_Samples=$NUM_SAMPLES, Total_Iterations=$TOTAL_ITERATIONS" 32 | 33 | python3 $PATH_TO_PIE/src/data_augmentation/data_augmentation.py \ 34 | --working_dir $WORKING_DIR \ 35 | --results_dir_root $RESULTS_DIR \ 36 | --timeout $TIMEOUT \ 37 | --temperature $TEMPERATURE \ 38 | --top_p $TOP_P \ 39 | --max_tokens $MAX_TOKENS \ 40 | --max_prompt_length $MAX_PROMPT_LENGTH \ 41 | --model $MODEL \ 42 | --generation_strategy $GENERATION_STRATEGY \ 43 | --num_samples $NUM_SAMPLES \ 44 | --total_iterations $TOTAL_ITERATIONS 2>&1 | tee $RESULTS_DIR/temperature_${TEMPERATURE}_top_p_${TOP_P}_num_samples_${NUM_SAMPLES}_total_iterations_${TOTAL_ITERATIONS}.log 45 | done 46 | -------------------------------------------------------------------------------- /docs/PIE_experiments_readme.md: -------------------------------------------------------------------------------- 1 | ## PIE Experiments 2 | 3 | This directory contains the outputs from our experiments, generally in the following format (a short example of loading these files follows the list): 4 | 5 | - `aggregated_test_results.csv`: contains the final statistics, including those found in the paper as well as additional statistics, such as the %Opt metric reported in the paper computed at higher speedup thresholds like 1.5x, 2.0x, and so on. 6 | - `test_results.jsonl`: contains outputs and additional metrics after benchmarking with `gem5`. 7 | - `melted_test_results.jsonl`: an intermediate file containing `test_results.jsonl` melted into a long format for easier analysis. 8 | - `additional_test_results.jsonl`: an intermediate file that we did not use; it may or may not contain additional `gem5` benchmarking information. 9 | - `raw_test_results.jsonl`: an intermediate version of `test_results.jsonl` without benchmarking info. It usually exists, but in some cases it won't; `test_results.jsonl` and `melted_test_results.jsonl` are derived from this file and contain all of its information plus more.
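For reference, the snippet below shows one way to load and inspect these files with pandas. It is only a sketch: column names such as `speedup` are assumptions about the schema, so check the columns against your own outputs.

```python
# Illustrative only: load the experiment outputs described above.
import pandas as pd

aggregated = pd.read_csv("aggregated_test_results.csv")    # final statistics (e.g., %Opt, speedup)
results = pd.read_json("test_results.jsonl", lines=True)   # per-program outputs after gem5 benchmarking

print(aggregated.head())
print(results.columns.tolist())            # inspect which metrics are actually present
if "speedup" in results.columns:           # hypothetical column name
    print("mean speedup:", results["speedup"].mean())
```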
-------------------------------------------------------------------------------- /docs/images/38_animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/docs/images/38_animation.gif -------------------------------------------------------------------------------- /docs/images/46_animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/docs/images/46_animation.gif -------------------------------------------------------------------------------- /docs/images/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/docs/images/arch.png -------------------------------------------------------------------------------- /docs/images/performance_conditioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/docs/images/performance_conditioning.png -------------------------------------------------------------------------------- /docs/images/self_play.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/docs/images/self_play.drawio.png -------------------------------------------------------------------------------- /finetuning/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | 163 | wandb 164 | wandb/* 165 | saved_models/* 166 | saved_models/ -------------------------------------------------------------------------------- /finetuning/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Alexander G Shypula, Aman Madaan, Yimeng Zeng, Uri Alon, Jacob R. Gardner, Yiming Yang, Milad Hashemi, Graham Neubig, Parthasarathy Ranganathan, Osbert Bastani, Amir Yazdanbakhsh 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /finetuning/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning Scripts for codellama 2 | 3 | ## Overview 4 | 5 | This subdirectory contains the scripts used to finetune codellama models for PIE. ``train.sh`` contains an example bash script for finetuning codellama-7b with the default prompt. ``sample.py`` and ``server.sh`` are used for sampling with the prompt templates and the [text-generation-inference](https://github.com/huggingface/text-generation-inference) API. 6 | 7 | ## Docker Setup for Finetuning 8 | 9 | To use the provided Docker image for finetuning, you need to install Docker and mount the directory properly. Follow these steps: 10 | 11 | 1. Install Docker: Follow the instructions on the [official Docker website](https://docs.docker.com/get-docker/) to install Docker on your system. 12 | 13 | 2. Mount the directory for the data: When running the Docker container, use the `-v` option to mount the directory containing your data. For example: 14 | ```bash 15 | docker run -v /path/to/your/data:/workspace/data yimengzeng/pie:torch201 16 | ``` 17 | 18 | ## Finetuning 19 | 20 | We provide a docker image at yimengzeng/pie:torch201 which contains all of the dependencies for finetuning the model; you can also refer to docker/Dockerfile for the specific packages required to replicate the environment. 21 | 22 | To finetune codellama with the entire PIE dataset and the non-performance-conditioned prompt, run 23 | 24 | ```bash 25 | bash train.sh 26 | ``` 27 | 28 | inside the Docker container. 29 | 30 | To finetune codellama with the performance-conditioned prompt, change the --prompt_template_name flag to "code_opt_w_speedup_pctile". 31 | 32 | To use different training files, modify the --train_name, --val_name, and --test_name flags in train.sh with the paths to your training, validation, and test files, respectively, and mount the directory containing the files when running the Docker container. 33 | 34 | 35 | ## Sampling 36 | 37 | To generate prompts for the models, please follow the details in the paper. Additional utilities for constructing prompts are located in `templates` and in the `utils/prompter.py` module. 38 | 39 | To sample optimized programs using the finetuned model with the text-generation-inference tool, first replace the PATH_TO_MODEL field with the actual path of the finetuned model in server.sh, and then, to serve the model, run 40 | 41 | ```bash 42 | bash server.sh 43 | ``` 44 | To sample from the model just served, with the default parameters used in the paper, run 45 | 46 | ```bash 47 | bash sample.sh 48 | ``` 49 | Note that sampling does not require you to spin up the container on your own. You can modify the following parameters in `server.sh` and `sample.sh`: 50 | 51 | For `server.sh`: 52 | - `model`: Set this to the path of your finetuned model, e.g., `'codellama/CodeLlama-7b-hf'`. 53 | - `volume`: Set this to the path where your model is stored, e.g., `$PWD/saved_models/`. 54 | - `max_best_of`: Set this to the maximum number of samples to generate in parallel, e.g., `20`. 55 | 56 | For `sample.sh`: 57 | - `--test_file`: Set this to the path of your test file. 58 | - `--output_file`: Set this to the path where you want to save the results. 59 | - `--num_samples`: Set this to the number of samples you want to generate. 60 | - `--num_threads`: Set this to the number of threads you want to use for sampling.
61 | - `--prompt_name`: Set this to the name of the prompt template you want to use. 62 | - `--temperature`: Set this to the temperature parameter for sampling. 63 | 64 | 65 | ## Models 66 | 67 | Here are links to the finetuned models used in the paper and the corresponding pre-trained models used for finetuning: 68 | 69 | | Experiment | Model | Type | Pretrained Link | Finetuned Link | 70 | |------------|-------|------|-----------------|----------------| 71 | | All | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-all-uncon-7b](https://huggingface.co/LearningOpt/pie-all-uncon-7b) | 72 | | All | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-all-uncon-13b](https://huggingface.co/LearningOpt/pie-all-uncon-13b) | 73 | | HQ | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-hq-selfplay-7b](https://huggingface.co/LearningOpt/pie-hq-selfplay-7b) | 74 | | HQ | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-hq-selfplay-13b](https://huggingface.co/LearningOpt/pie-hq-selfplay-13b) | 75 | | All w/Perf-Cond | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-conditioned-7b](https://huggingface.co/LearningOpt/pie-conditioned-7b) | 76 | | All w/Perf-Cond | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-conditioned-13b](https://huggingface.co/LearningOpt/pie-conditioned-13b) | 77 | | HQ + Self-Play | CodeLlama 7B | FT | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LearningOpt/pie-hq-selfplay-7b](https://huggingface.co/LearningOpt/pie-hq-selfplay-7b) | 78 | | HQ + Self-Play | CodeLlama 13B | FT | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [LearningOpt/pie-hq-selfplay-13b](https://huggingface.co/LearningOpt/pie-hq-selfplay-13b) | 79 | -------------------------------------------------------------------------------- /finetuning/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel 2 | 3 | RUN apt-get update && apt-get install -y wget git 4 | RUN pip install tqdm wandb lightning fire rotary_embedding_torch pandas numpy matplotlib transformers datasets peft text_generation ninja packaging 5 | RUN MAX_JOBS=32 pip install flash-attn --no-build-isolation 6 | RUN pip install accelerate bitsandbytes 7 | RUN pip install scipy 8 | RUN pip install sentencepiece 9 | 10 | # ENV WANDB_API_KEY=yourkeyhere 11 | WORKDIR /workspace -------------------------------------------------------------------------------- /finetuning/finetune.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finetune the model on the codellama dataset 3 | 4 | Code adapted from the alpaca-lora repository at https://github.com/tloen/alpaca-lora/blob/main/finetune.py 5 | """ 6 | 7 | import os 8 | import sys 9 | from typing import List 10 | 11 | import fire 12 | import torch 13 | import transformers 14 | from datasets import load_dataset 15 | 16 | """ 17 | Unused imports: 18 | import torch.nn as nn 19 | import bitsandbytes as bnb 20 | """ 21 | 22 | from transformers import AutoModelForCausalLM, CodeLlamaTokenizer 
23 | 24 | from utils.prompter import Prompter 25 | 26 | 27 | def train( 28 | # model/data params 29 | base_model: str = "codellama/CodeLlama-13b-hf", 30 | data_path: str = "data/code_data", 31 | output_dir: str = "./code_opt/13b-test", 32 | # training hyperparams 33 | batch_size: int = 128, 34 | micro_batch_size: int = 2, 35 | num_epochs: int = 3, 36 | learning_rate: float = 3e-4, 37 | cutoff_len: int = 1024, 38 | val_set_size: int = 2000, 39 | train_on_inputs: bool = True, # if False, masks out inputs in loss 40 | add_eos_token: bool = True, 41 | group_by_length: bool = False, 42 | # wandb params 43 | wandb_project: str = "code-llama", 44 | wandb_run_name: str = "", 45 | wandb_watch: str = "", # options: false | gradients | all 46 | wandb_log_model: str = "", # options: false | true 47 | resume_from_checkpoint: str = None, # either training checkpoint or final adapter 48 | prompt_template_name: str = "code_opt", # The prompt template to use, will default to code_opt. 49 | use_flash_attention = True, 50 | use_wandb: bool = True, # if True, will use wandb if wandb_project is set 51 | # training data and prompt template 52 | train_name: str = "train.jsonl", 53 | val_name: str = "val.jsonl", 54 | test_name: str = "test.jsonl", 55 | with_speedup_desc: bool = False, # if True, we use templates/code_opt_w_speedup_desc.json 56 | with_speedup_bin: bool = False, # if True, we use templates/code_opt_w_speedup_bin.json 57 | with_pctile: bool = False, # if True, we use templates/code_opt_w_speedup_pctile.json 58 | ): 59 | if with_speedup_desc and with_speedup_bin: 60 | raise ValueError("Both with_speedup_desc and with_speedup_bin can not be TRUE!!!") 61 | if int(os.environ.get("LOCAL_RANK", 0)) == 0: 62 | print( 63 | f"Training code_opt-LoRA model with params:\n" 64 | f"base_model: {base_model}\n" 65 | f"data_path: {data_path}\n" 66 | f"output_dir: {output_dir}\n" 67 | f"batch_size: {batch_size}\n" 68 | f"micro_batch_size: {micro_batch_size}\n" 69 | f"num_epochs: {num_epochs}\n" 70 | f"learning_rate: {learning_rate}\n" 71 | f"cutoff_len: {cutoff_len}\n" 72 | f"val_set_size: {val_set_size}\n" 73 | f"train_on_inputs: {train_on_inputs}\n" 74 | f"add_eos_token: {add_eos_token}\n" 75 | f"group_by_length: {group_by_length}\n" 76 | f"wandb_project: {wandb_project}\n" 77 | f"wandb_run_name: {wandb_run_name}\n" 78 | f"wandb_watch: {wandb_watch}\n" 79 | f"wandb_log_model: {wandb_log_model}\n" 80 | f"resume_from_checkpoint: {resume_from_checkpoint or False}\n" 81 | f"prompt template: {prompt_template_name}\n" 82 | f"Train File: {os.path.join(data_path, train_name)}\n" 83 | f"Val File: {os.path.join(data_path, val_name)}\n" 84 | f"Test File: {os.path.join(data_path, test_name)}\n" 85 | ) 86 | assert ( 87 | base_model 88 | ), "Please specify a --base_model, e.g. 
--base_model='huggyllama/llama-7b'" 89 | gradient_accumulation_steps = batch_size // micro_batch_size 90 | 91 | if with_speedup_desc: 92 | prompter = Prompter(template_name="code_opt_w_speedup_desc") 93 | elif with_speedup_bin: 94 | prompter = Prompter(template_name="code_opt_w_speedup_bin") 95 | elif with_pctile: 96 | prompter = Prompter(template_name="code_opt_w_speedup_pctile") 97 | else: 98 | prompter = Prompter(prompt_template_name) 99 | 100 | device_map = "auto" 101 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 102 | ddp = world_size != 1 103 | if ddp: 104 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} 105 | gradient_accumulation_steps = gradient_accumulation_steps // world_size 106 | 107 | if use_wandb: 108 | # Check if parameter passed or if set within environ 109 | use_wandb = len(wandb_project) > 0 or ( 110 | "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0 111 | ) 112 | # Only overwrite environ if wandb param passed 113 | if len(wandb_project) > 0: 114 | os.environ["WANDB_PROJECT"] = wandb_project 115 | if len(wandb_watch) > 0: 116 | os.environ["WANDB_WATCH"] = wandb_watch 117 | if len(wandb_log_model) > 0: 118 | os.environ["WANDB_LOG_MODEL"] = wandb_log_model 119 | 120 | # make sure to have latest version of transformers library and flash attention installed 121 | model = AutoModelForCausalLM.from_pretrained( 122 | base_model, 123 | torch_dtype=torch.bfloat16, 124 | device_map=device_map, 125 | attn_implementation="flash_attention_2", 126 | ) 127 | 128 | tokenizer = CodeLlamaTokenizer.from_pretrained(base_model) 129 | 130 | tokenizer.pad_token_id = ( 131 | 0 # unk. 132 | ) 133 | tokenizer.padding_side = "left" # Allow batched inference 134 | 135 | def tokenize(prompt, add_eos_token=True): 136 | result = tokenizer( 137 | prompt, 138 | truncation=True, 139 | max_length=cutoff_len, 140 | padding=False, 141 | return_tensors=None, 142 | ) 143 | if ( 144 | result["input_ids"][-1] != tokenizer.eos_token_id 145 | and len(result["input_ids"]) < cutoff_len 146 | and add_eos_token 147 | ): 148 | result["input_ids"].append(tokenizer.eos_token_id) 149 | result["attention_mask"].append(1) 150 | 151 | result["labels"] = result["input_ids"].copy() 152 | 153 | return result 154 | 155 | def generate_and_tokenize_prompt(data_point): 156 | full_prompt = prompter.generate_prompt( 157 | data_point["src_code"], 158 | data_point["tgt_code"], 159 | speedup_desc=data_point["speedup_desc"] if with_speedup_desc else None, 160 | speedup_bin=data_point["speedup_bin"] if with_speedup_bin else None, 161 | pctile=data_point["target_reward_updated_pct_bin"] if "target_reward_updated_pct_bin" in data_point else None, 162 | ) 163 | tokenized_full_prompt = tokenize(full_prompt) 164 | if not train_on_inputs: 165 | user_prompt = prompter.generate_prompt( 166 | data_point["src_code"], 167 | speedup_desc=data_point["speedup_desc"] if with_speedup_desc else None, 168 | speedup_bin=data_point["speedup_bin"] if with_speedup_bin else None, 169 | pctile=data_point["target_reward_updated_pct_bin"] if "target_reward_updated_pct_bin" in data_point else None, 170 | ) 171 | tokenized_user_prompt = tokenize( 172 | user_prompt, add_eos_token=add_eos_token 173 | ) 174 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 175 | 176 | if add_eos_token: 177 | user_prompt_len -= 1 178 | 179 | tokenized_full_prompt["labels"] = [ 180 | -100 181 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 182 | user_prompt_len: 183 | ] # could be sped up, probably 184 | return 
tokenized_full_prompt 185 | 186 | ## Loading data 187 | 188 | datafiles = {'train': f'{data_path}//{train_name}', 'test': f'{data_path}//{test_name}', 'validation': f'{data_path}//{val_name}'} 189 | 190 | data = load_dataset("json", data_files=datafiles) 191 | 192 | print(f"Is training on inputs: {train_on_inputs}") 193 | 194 | train_data = data["train"].shuffle().map(generate_and_tokenize_prompt) 195 | val_data = data["validation"].shuffle().map(generate_and_tokenize_prompt) 196 | 197 | # model.print_trainable_parameters() # Be more transparent about the % of trainable params. 198 | 199 | if not ddp and torch.cuda.device_count() > 1: 200 | # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available 201 | model.is_parallelizable = True 202 | model.model_parallel = True 203 | 204 | trainer = transformers.Trainer( 205 | model=model, 206 | train_dataset=train_data, 207 | eval_dataset=val_data, 208 | args=transformers.TrainingArguments( 209 | per_device_train_batch_size=micro_batch_size, 210 | gradient_accumulation_steps=gradient_accumulation_steps, 211 | warmup_steps=100, 212 | num_train_epochs=num_epochs, 213 | learning_rate=learning_rate, 214 | bf16=True, 215 | logging_steps=1, 216 | optim="adamw_torch", 217 | evaluation_strategy="steps" if val_set_size > 0 else "no", 218 | save_strategy="steps", 219 | eval_steps=50 if val_set_size > 0 else None, 220 | save_steps=50, 221 | output_dir=output_dir, 222 | save_total_limit=10, 223 | load_best_model_at_end=True if val_set_size > 0 else False, 224 | ddp_find_unused_parameters=False if ddp else None, 225 | group_by_length=group_by_length, 226 | report_to="wandb" if use_wandb else None, 227 | run_name=wandb_run_name if use_wandb else "none", 228 | fsdp=["full_shard", "auto_wrap"], 229 | gradient_checkpointing=True, 230 | resume_from_checkpoint=f"{output_dir}" if resume_from_checkpoint else None, 231 | ), 232 | data_collator=transformers.DataCollatorForSeq2Seq( 233 | tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 234 | ), 235 | ) 236 | model.config.use_cache = False 237 | 238 | if torch.__version__ >= "2" and sys.platform != "win32": 239 | model = torch.compile(model) 240 | 241 | trainer.train(resume_from_checkpoint=resume_from_checkpoint) 242 | 243 | model.save_pretrained(output_dir, max_shard_size="100GB") # save in 1 shard to work with tgi docker image, have bugs with multiple shards 244 | 245 | print( 246 | "\n If there's a warning about missing keys above, please disregard :)" 247 | ) 248 | 249 | 250 | if __name__ == "__main__": 251 | fire.Fire(train) 252 | -------------------------------------------------------------------------------- /finetuning/sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code used for sampling programs based on the text-generation-inference API at https://github.com/huggingface/text-generation-inference 3 | 4 | """ 5 | 6 | 7 | from text_generation import Client 8 | import pandas as pd 9 | from utils.prompter import Prompter 10 | from tqdm import tqdm 11 | import fire 12 | import re 13 | 14 | import concurrent.futures 15 | 16 | def extract_first_program(text): 17 | # Look for the main function's start, considering possible non-standard code 18 | main_start = re.search(r"\b(?:int\s+)?main\b", text) 19 | 20 | if not main_start: 21 | return text # Return original if main is not found 22 | 23 | open_braces = 0 24 | closing_brace_position = -1 25 | main_function_started = False 26 | 27 | # Start looking for opening brace after 
the detected main function 28 | i = main_start.end() 29 | 30 | while i < len(text): 31 | if text[i] == "{": 32 | open_braces += 1 33 | if not main_function_started: 34 | main_function_started = True 35 | 36 | elif text[i] == "}": 37 | open_braces -= 1 38 | if open_braces == 0 and main_function_started: 39 | closing_brace_position = i 40 | break 41 | 42 | i += 1 43 | 44 | # If we found a closing brace for the first program 45 | if closing_brace_position != -1: 46 | return text[: closing_brace_position + 1] 47 | else: 48 | return text # Return original text if a matching closing brace wasn't found 49 | 50 | 51 | def postprocess(text, prompt_name): 52 | 53 | if prompt_name == 'code_opt': 54 | return extract_first_program(text) 55 | else: 56 | return text 57 | 58 | 59 | def main( 60 | test_file=None, 61 | output_file=None, 62 | do_sample=None, 63 | num_samples=8, 64 | max_new_tokens=1000, 65 | temperature=0.7, 66 | num_threads=20, # number of threads to use for parallel processing 67 | prompt_name="code_opt", 68 | ): 69 | # print do_sample 70 | print(f"do_sample: {do_sample}") 71 | # print type of do_sample 72 | print(f"type of do_sample: {type(do_sample)}") 73 | 74 | client = Client("http://127.0.0.1:8080", timeout=100) 75 | 76 | prompter = Prompter(template_name=prompt_name) 77 | 78 | print(f"prompt_name: {prompt_name}") 79 | 80 | test_df = pd.read_json(test_file, lines=True, orient="records") 81 | 82 | # create results dataframe with src_code column 83 | results_df = pd.DataFrame(columns=["src_code"]) 84 | results_df["src_code"] = test_df["src_code"] 85 | # create empty column for completions 86 | results_df["generated_answers"] = results_df.apply(lambda x: [], axis=1) 87 | 88 | def process_request(index, src_code): 89 | all_completions = [] 90 | 91 | prompt = prompter.generate_prompt(src_code=src_code) 92 | 93 | if do_sample: 94 | completions = client.generate( 95 | prompt, 96 | max_new_tokens=max_new_tokens, 97 | do_sample=True, 98 | temperature=temperature, 99 | best_of=num_samples, 100 | ) 101 | else: 102 | completions = client.generate( 103 | prompt, 104 | max_new_tokens=max_new_tokens, 105 | do_sample=False, 106 | # best_of=num_samples, 107 | ) 108 | 109 | # get all completions from output 110 | best_of_sequences = [ 111 | completions.details.best_of_sequences[i].generated_text 112 | for i in range(len(completions.details.best_of_sequences)) 113 | ] 114 | 115 | all_programs = [postprocess(completions.generated_text, prompt_name=prompt_name)] + [ 116 | postprocess(best_of_sequences[i], prompt_name=prompt_name) 117 | for i in range(len(best_of_sequences)) 118 | ] 119 | 120 | return index, all_programs 121 | 122 | # Use ThreadPoolExecutor to process in parallel 123 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: 124 | future_to_index = {executor.submit(process_request, i, row["src_code"]): i for i, row in test_df.iterrows()} 125 | for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(test_df)): 126 | index, all_programs = future.result() 127 | results_df.at[index, "generated_answers"] = all_programs 128 | 129 | # add generated_answers column to test_df 130 | test_df["generated_answers"] = results_df["generated_answers"] 131 | 132 | # save test_df to output_file 133 | test_df.to_json(output_file, orient="records", lines=True) 134 | 135 | 136 | if __name__ == "__main__": 137 | fire.Fire(main) 138 | -------------------------------------------------------------------------------- /finetuning/sample.sh: 
-------------------------------------------------------------------------------- 1 | python sample.py \ 2 | --test_file PATH_TO_TEST_FILE/test_file.jsonl \ 3 | --output_file PATH_TO_OUTPUTS/results.jsonl \ 4 | --do_sample True \ 5 | --num_samples 8 \ 6 | --num_threads 8 \ 7 | --prompt_name code_opt \ 8 | --prompt_name code_opt \ 9 | --temperature 0.7 -------------------------------------------------------------------------------- /finetuning/server.sh: -------------------------------------------------------------------------------- 1 | model='PATH_TO_MODEL' # 'codellama/CodeLlama-7b-hf' for example 2 | volume=$PWD/saved_models/ # share a volume with the Docker container to avoid downloading weights every run 3 | max_best_of=20 # max number of samples to generate in parallel 4 | 5 | docker run -e NVIDIA_VISIBLE_DEVICES="0,1,2,3" --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest \ 6 | --model-id $model --max-best-of $max_best_of 7 | -------------------------------------------------------------------------------- /finetuning/templates/code_opt.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used for program optimization.", 3 | "prompt_no_input": "Below is a program. Optimize the program and provide a more efficient version.\n\n### Program:\n{src_code}\n\n### Optimized Version:\n", 4 | "response_split": "### Optimized Version:" 5 | } 6 | -------------------------------------------------------------------------------- /finetuning/templates/code_opt_w_speedup_bin.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used for program optimization.", 3 | "prompt_no_input": "Below is a program. Optimize the program and provide a more efficient version with at least {speedup_bin} speedup.\n\n### Program:\n{src_code}\n\n### Optimized Version with at least {speedup_bin}:\n", 4 | "response_split": "### Optimized Version:" 5 | } 6 | -------------------------------------------------------------------------------- /finetuning/templates/code_opt_w_speedup_desc.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used for program optimization.", 3 | "prompt_no_input": "Below is a program. Optimize the program and provide a {speedup_desc} version.\n\n### Program:\n{src_code}\n\n### {speedup_desc} optimized Version:\n", 4 | "response_split": "### Optimized Version:" 5 | } 6 | -------------------------------------------------------------------------------- /finetuning/templates/code_opt_w_speedup_pctile.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used for program optimization.", 3 | "prompt_no_input": "This is a slow program we want to optimize {pctile}/10. 
### Program:\n{src_code}\n\n### Optimized Version with score {pctile}/10:\n", 4 | "response_split": "### Optimized Version:" 5 | } -------------------------------------------------------------------------------- /finetuning/tokenizer_files/13B/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": { 3 | "content": "", 4 | "lstrip": false, 5 | "normalized": true, 6 | "rstrip": false, 7 | "single_word": false 8 | }, 9 | "eos_token": { 10 | "content": "", 11 | "lstrip": false, 12 | "normalized": true, 13 | "rstrip": false, 14 | "single_word": false 15 | }, 16 | "unk_token": { 17 | "content": "", 18 | "lstrip": false, 19 | "normalized": true, 20 | "rstrip": false, 21 | "single_word": false 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /finetuning/tokenizer_files/13B/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/finetuning/tokenizer_files/13B/tokenizer.model -------------------------------------------------------------------------------- /finetuning/tokenizer_files/13B/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": true, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "clean_up_tokenization_spaces": false, 13 | "eos_token": { 14 | "__type": "AddedToken", 15 | "content": "", 16 | "lstrip": false, 17 | "normalized": true, 18 | "rstrip": false, 19 | "single_word": false 20 | }, 21 | "legacy": null, 22 | "model_max_length": 1000000000000000019884624838656, 23 | "pad_token": null, 24 | "sp_model_kwargs": {}, 25 | "tokenizer_class": "CodeLlamaTokenizer", 26 | "unk_token": { 27 | "__type": "AddedToken", 28 | "content": "", 29 | "lstrip": false, 30 | "normalized": true, 31 | "rstrip": false, 32 | "single_word": false 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /finetuning/tokenizer_files/7B/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "additional_special_tokens": [ 3 | "▁
",
 4 |     "▁",
 5 |     "▁",
 6 |     "▁"
 7 |   ],
 8 |   "bos_token": {
 9 |     "content": "<s>",
10 |     "lstrip": false,
11 |     "normalized": true,
12 |     "rstrip": false,
13 |     "single_word": false
14 |   },
15 |   "eos_token": {
16 |     "content": "</s>",
17 |     "lstrip": false,
18 |     "normalized": true,
19 |     "rstrip": false,
20 |     "single_word": false
21 |   },
22 |   "unk_token": {
23 |     "content": "<unk>",
24 |     "lstrip": false,
25 |     "normalized": true,
26 |     "rstrip": false,
27 |     "single_word": false
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/tokenizer.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/finetuning/tokenizer_files/7B/tokenizer.model


--------------------------------------------------------------------------------
/finetuning/tokenizer_files/7B/tokenizer_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "add_bos_token": true,
 3 |   "add_eos_token": false,
 4 |   "additional_special_tokens": [
 5 |     "▁<PRE>",
 6 |     "▁<MID>",
 7 |     "▁<SUF>",
 8 |     "▁<EOT>"
 9 |   ],
10 |   "bos_token": {
11 |     "__type": "AddedToken",
12 |     "content": "<s>",
13 |     "lstrip": false,
14 |     "normalized": true,
15 |     "rstrip": false,
16 |     "single_word": false
17 |   },
18 |   "clean_up_tokenization_spaces": false,
19 |   "eos_token": {
20 |     "__type": "AddedToken",
21 |     "content": "</s>",
22 |     "lstrip": false,
23 |     "normalized": true,
24 |     "rstrip": false,
25 |     "single_word": false
26 |   },
27 |   "eot_token": "▁<EOT>",
28 |   "fill_token": "<FILL_ME>",
29 |   "legacy": null,
30 |   "middle_token": "▁<MID>",
31 |   "model_max_length": 1000000000000000019884624838656,
32 |   "pad_token": null,
33 |   "prefix_token": "▁<PRE>",
34 |   "sp_model_kwargs": {},
35 |   "suffix_first": false,
36 |   "suffix_token": "▁<SUF>",
37 |   "tokenizer_class": "CodeLlamaTokenizer",
38 |   "unk_token": {
39 |     "__type": "AddedToken",
40 |     "content": "<unk>",
41 |     "lstrip": false,
42 |     "normalized": true,
43 |     "rstrip": false,
44 |     "single_word": false
45 |   },
46 |   "use_default_system_prompt": false
47 | }
48 | 


--------------------------------------------------------------------------------
/finetuning/train.sh:
--------------------------------------------------------------------------------
 1 | OUTPUT_DIR=${OUTPUT_DIR:-"saved_models/code_opt"}
 2 | BASE_MODEL=${BASE_MODEL:-"codellama/CodeLlama-7b-hf"}
 3 | 
 4 | torchrun --nproc_per_node=8 \
 5 |     --master_port=1234 finetune.py \
 6 |     --base_model $BASE_MODEL \
 7 |     --data_path ./data/ \
 8 |     --output_dir $OUTPUT_DIR \
 9 |     --batch_size 32 \
10 |     --micro_batch_size 2 \
11 |     --num_epochs 1 \
12 |     --learning_rate 1e-5 \
13 |     --cutoff_len 2000 \
14 |     --train_on_inputs False \
15 |     --prompt_template_name "code_opt" \
16 |     --use_flash_attention True \
17 |     --train_name "train.jsonl" \
18 |     --val_name "val.jsonl" \
19 |     --test_name "test.jsonl" \
20 |     --wandb_project "code_opt"
21 | 
22 | # Copy tokenizer files to appropriate location, modify this if model is different
23 | if [[ $BASE_MODEL == *"7b"* ]]; then
24 |     cp -r ./tokenizer_files/7B/* $OUTPUT_DIR
25 | elif [[ $BASE_MODEL == *"13b"* ]]; then
26 |     cp -r ./tokenizer_files/13B/* $OUTPUT_DIR
27 | else
28 |     echo "Base model size not recognized. Tokenizer files not copied."
29 | fi
30 | 


--------------------------------------------------------------------------------
/finetuning/utils/convert_to_safetensors.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import shutil
  5 | from collections import defaultdict
  6 | from inspect import signature
  7 | from tempfile import TemporaryDirectory
  8 | from typing import Dict, List, Optional, Set, Tuple
  9 | 
 10 | import torch
 11 | 
 12 | from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, hf_hub_download
 13 | from huggingface_hub.file_download import repo_folder_name
 14 | from safetensors.torch import load_file, save_file
 15 | from transformers import AutoConfig
 16 | 
 17 | 
 18 | COMMIT_DESCRIPTION = """
 19 | This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
 20 | 
 21 | This new file is equivalent to `pytorch_model.bin` but safe in the sense that
 22 | no arbitrary code can be put into it.
 23 | 
 24 | These files also happen to load much faster than their pytorch counterpart:
 25 | https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
 26 | 
 27 | The widgets on your model page will run using this model even if this is not merged
 28 | making sure the file actually works.
 29 | 
 30 | If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
 31 | 
 32 | Feel free to ignore this PR.
 33 | """
 34 | 
 35 | ConversionResult = Tuple[List["CommitOperationAdd"], List[Tuple[str, "Exception"]]]
 36 | 
 37 | 
 38 | class AlreadyExists(Exception):
 39 |     pass
 40 | 
 41 | 
 42 | def shared_pointers(tensors):
 43 |     ptrs = defaultdict(list)
 44 |     for k, v in tensors.items():
 45 |         ptrs[v.data_ptr()].append(k)
 46 |     failing = []
 47 |     for ptr, names in ptrs.items():
 48 |         if len(names) > 1:
 49 |             failing.append(names)
 50 |     return failing
 51 | 
 52 | 
 53 | def check_file_size(sf_filename: str, pt_filename: str):
 54 |     sf_size = os.stat(sf_filename).st_size
 55 |     pt_size = os.stat(pt_filename).st_size
 56 | 
 57 |     if (sf_size - pt_size) / pt_size > 0.01:
 58 |         raise RuntimeError(
 59 |             f"""The file size difference is more than 1%:
 60 |          - {sf_filename}: {sf_size}
 61 |          - {pt_filename}: {pt_size}
 62 |          """
 63 |         )
 64 | 
 65 | 
 66 | def rename(pt_filename: str) -> str:
 67 |     filename, ext = os.path.splitext(pt_filename)
 68 |     local = f"{filename}.safetensors"
 69 |     local = local.replace("pytorch_model", "model")
 70 |     return local
 71 | 
 72 | 
 73 | def convert_multi(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
 74 |     filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json", token=token, cache_dir=folder)
 75 |     with open(filename, "r") as f:
 76 |         data = json.load(f)
 77 | 
 78 |     filenames = set(data["weight_map"].values())
 79 |     local_filenames = []
 80 |     for filename in filenames:
 81 |         pt_filename = hf_hub_download(repo_id=model_id, filename=filename, token=token, cache_dir=folder)
 82 | 
 83 |         sf_filename = rename(pt_filename)
 84 |         sf_filename = os.path.join(folder, sf_filename)
 85 |         convert_file(pt_filename, sf_filename)
 86 |         local_filenames.append(sf_filename)
 87 | 
 88 |     index = os.path.join(folder, "model.safetensors.index.json")
 89 |     with open(index, "w") as f:
 90 |         newdata = {k: v for k, v in data.items()}
 91 |         newmap = {k: rename(v) for k, v in data["weight_map"].items()}
 92 |         newdata["weight_map"] = newmap
 93 |         json.dump(newdata, f, indent=4)
 94 |     local_filenames.append(index)
 95 | 
 96 |     operations = [
 97 |         CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
 98 |     ]
 99 |     errors: List[Tuple[str, "Exception"]] = []
100 | 
101 |     return operations, errors
102 | 
103 | 
104 | def convert_single(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
105 |     pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin", token=token, cache_dir=folder)
106 | 
107 |     sf_name = "model.safetensors"
108 |     sf_filename = os.path.join(folder, sf_name)
109 |     convert_file(pt_filename, sf_filename)
110 |     operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
111 |     errors: List[Tuple[str, "Exception"]] = []
112 |     return operations, errors
113 | 
114 | 
115 | def convert_file(
116 |     pt_filename: str,
117 |     sf_filename: str,
118 | ):
119 |     loaded = torch.load(pt_filename, map_location="cpu")
120 |     if "state_dict" in loaded:
121 |         loaded = loaded["state_dict"]
122 |     shared = shared_pointers(loaded)
123 |     for shared_weights in shared:
124 |         for name in shared_weights[1:]:
125 |             loaded.pop(name)
126 | 
127 |     # For tensors to be contiguous
128 |     loaded = {k: v.contiguous() for k, v in loaded.items()}
129 | 
130 |     dirname = os.path.dirname(sf_filename)
131 |     os.makedirs(dirname, exist_ok=True)
132 |     save_file(loaded, sf_filename, metadata={"format": "pt"})
133 |     check_file_size(sf_filename, pt_filename)
134 |     reloaded = load_file(sf_filename)
135 |     for k in loaded:
136 |         pt_tensor = loaded[k]
137 |         sf_tensor = reloaded[k]
138 |         if not torch.equal(pt_tensor, sf_tensor):
139 |             raise RuntimeError(f"The output tensors do not match for key {k}")
140 | 
141 | 
142 | def create_diff(pt_infos: Dict[str, List[str]], sf_infos: Dict[str, List[str]]) -> str:
143 |     errors = []
144 |     for key in ["missing_keys", "mismatched_keys", "unexpected_keys"]:
145 |         pt_set = set(pt_infos[key])
146 |         sf_set = set(sf_infos[key])
147 | 
148 |         pt_only = pt_set - sf_set
149 |         sf_only = sf_set - pt_set
150 | 
151 |         if pt_only:
152 |             errors.append(f"{key} : PT warnings contain {pt_only} which are not present in SF warnings")
153 |         if sf_only:
154 |             errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
155 |     return "\n".join(errors)
156 | 
157 | 
158 | def check_final_model(model_id: str, folder: str, token: Optional[str]):
159 |     config = hf_hub_download(repo_id=model_id, filename="config.json", token=token, cache_dir=folder)
160 |     shutil.copy(config, os.path.join(folder, "config.json"))
161 |     config = AutoConfig.from_pretrained(folder)
162 | 
163 |     import transformers
164 | 
165 |     class_ = getattr(transformers, config.architectures[0])
166 |     with torch.device("meta"):
167 |         (pt_model, pt_infos) = class_.from_pretrained(folder, output_loading_info=True)
168 |         (sf_model, sf_infos) = class_.from_pretrained(folder, output_loading_info=True)
169 |     
170 |         if pt_infos != sf_infos:
171 |             error_string = create_diff(pt_infos, sf_infos)
172 |             raise ValueError(f"Different infos when reloading the model: {error_string}")
173 | 
174 |     #### XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
175 |     ####  SKIPPING THE REST OF THE test to save RAM
176 |     return
177 |     pt_params = pt_model.state_dict()
178 |     sf_params = sf_model.state_dict()
179 | 
180 |     pt_shared = shared_pointers(pt_params)
181 |     sf_shared = shared_pointers(sf_params)
182 |     if pt_shared != sf_shared:
183 |         raise RuntimeError(f"The reconstructed model is wrong, shared tensors are different {pt_shared} != {sf_shared}")
184 | 
185 |     sig = signature(pt_model.forward)
186 |     input_ids = torch.arange(10).unsqueeze(0)
187 |     pixel_values = torch.randn(1, 3, 224, 224)
188 |     input_values = torch.arange(1000).float().unsqueeze(0)
189 |     # Hardcoded for whisper basically
190 |     input_features = torch.zeros((1, 80, 3000))
191 |     kwargs = {}
192 |     if "input_ids" in sig.parameters:
193 |         kwargs["input_ids"] = input_ids
194 |     if "input_features" in sig.parameters:
195 |         kwargs["input_features"] = input_features
196 |     if "decoder_input_ids" in sig.parameters:
197 |         kwargs["decoder_input_ids"] = input_ids
198 |     if "pixel_values" in sig.parameters:
199 |         kwargs["pixel_values"] = pixel_values
200 |     if "input_values" in sig.parameters:
201 |         kwargs["input_values"] = input_values
202 |     if "bbox" in sig.parameters:
203 |         kwargs["bbox"] = torch.zeros((1, 10, 4)).long()
204 |     if "image" in sig.parameters:
205 |         kwargs["image"] = pixel_values
206 | 
207 |     if torch.cuda.is_available():
208 |         pt_model = pt_model.cuda()
209 |         sf_model = sf_model.cuda()
210 |         kwargs = {k: v.cuda() for k, v in kwargs.items()}
211 | 
212 |     try:
213 |         pt_logits = pt_model(**kwargs)[0]
214 |     except Exception as e:
215 |         try:
216 |             # Musicgen special exception.
217 |             decoder_input_ids = torch.ones((input_ids.shape[0] * pt_model.decoder.num_codebooks, 1), dtype=torch.long)
218 |             if torch.cuda.is_available():
219 |                 decoder_input_ids = decoder_input_ids.cuda()
220 | 
221 |             kwargs["decoder_input_ids"] = decoder_input_ids
222 |             pt_logits = pt_model(**kwargs)[0]
223 |         except Exception:
224 |             raise e
225 |     sf_logits = sf_model(**kwargs)[0]
226 | 
227 |     torch.testing.assert_close(sf_logits, pt_logits)
228 |     print(f"Model {model_id} is ok !")
229 | 
230 | 
231 | def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
232 |     try:
233 |         main_commit = api.list_repo_commits(model_id)[0].commit_id
234 |         discussions = api.get_repo_discussions(repo_id=model_id)
235 |     except Exception:
236 |         return None
237 |     for discussion in discussions:
238 |         if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
239 |             commits = api.list_repo_commits(model_id, revision=discussion.git_reference)
240 | 
241 |             if main_commit == commits[1].commit_id:
242 |                 return discussion
243 |     return None
244 | 
245 | 
246 | def convert_generic(model_id: str, folder: str, filenames: Set[str], token: Optional[str]) -> ConversionResult:
247 |     operations = []
248 |     errors = []
249 | 
250 |     extensions = set([".bin", ".ckpt"])
251 |     for filename in filenames:
252 |         prefix, ext = os.path.splitext(filename)
253 |         if ext in extensions:
254 |             pt_filename = hf_hub_download(model_id, filename=filename, token=token, cache_dir=folder)
255 |             dirname, raw_filename = os.path.split(filename)
256 |             if raw_filename == "pytorch_model.bin":
257 |                 # XXX: This is a special case to handle `transformers` and the
258 |                 # `transformers` part of the model which is actually loaded by `transformers`.
259 |                 sf_in_repo = os.path.join(dirname, "model.safetensors")
260 |             else:
261 |                 sf_in_repo = f"{prefix}.safetensors"
262 |             sf_filename = os.path.join(folder, sf_in_repo)
263 |             try:
264 |                 convert_file(pt_filename, sf_filename)
265 |                 operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
266 |             except Exception as e:
267 |                 errors.append((pt_filename, e))
268 |     return operations, errors
269 | 
270 | 
271 | def convert(api: "HfApi", model_id: str, force: bool = False) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
272 |     pr_title = "Adding `safetensors` variant of this model"
273 |     info = api.model_info(model_id)
274 |     filenames = set(s.rfilename for s in info.siblings)
275 | 
276 |     with TemporaryDirectory() as d:
277 |         folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
278 |         os.makedirs(folder)
279 |         new_pr = None
280 |         try:
281 |             operations = None
282 |             pr = previous_pr(api, model_id, pr_title)
283 | 
284 |             library_name = getattr(info, "library_name", None)
285 |             if any(filename.endswith(".safetensors") for filename in filenames) and not force:
286 |                 raise AlreadyExists(f"Model {model_id} is already converted, skipping..")
287 |             elif pr is not None and not force:
288 |                 url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
289 |                 new_pr = pr
290 |                 raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
291 |             elif library_name == "transformers":
292 |                 if "pytorch_model.bin" in filenames:
293 |                     operations, errors = convert_single(model_id, folder, token=api.token)
294 |                 elif "pytorch_model.bin.index.json" in filenames:
295 |                     operations, errors = convert_multi(model_id, folder, token=api.token)
296 |                 else:
297 |                     raise RuntimeError(f"Model {model_id} doesn't seem to be a valid pytorch model. Cannot convert")
298 |                 check_final_model(model_id, folder, token=api.token)
299 |             else:
300 |                 operations, errors = convert_generic(model_id, folder, filenames, token=api.token)
301 | 
302 |             if operations:
303 |                 new_pr = api.create_commit(
304 |                     repo_id=model_id,
305 |                     operations=operations,
306 |                     commit_message=pr_title,
307 |                     commit_description=COMMIT_DESCRIPTION,
308 |                     create_pr=True,
309 |                 )
310 |                 print(f"Pr created at {new_pr.pr_url}")
311 |             else:
312 |                 print("No files to convert")
313 |         finally:
314 |             shutil.rmtree(folder)
315 |         return new_pr, errors
316 | 
317 | 
318 | if __name__ == "__main__":
319 |     DESCRIPTION = """
320 |     Simple utility tool to automatically convert weights on the hub to `safetensors` format.
321 |     It is PyTorch exclusive for now.
322 |     It works by downloading the weights (PT), converting them locally, and uploading them back
323 |     as a PR on the hub.
324 |     """
325 |     parser = argparse.ArgumentParser(description=DESCRIPTION)
326 |     parser.add_argument(
327 |         "model_id",
328 |         type=str,
329 |         help="The name of the model on the hub to convert. E.g. `gpt2` or `facebook/wav2vec2-base-960h`",
330 |     )
331 |     parser.add_argument(
332 |         "--force",
333 |         action="store_true",
334 |         help="Create the PR even if it already exists or if the model was already converted.",
335 |     )
336 |     parser.add_argument(
337 |         "-y",
338 |         action="store_true",
339 |         help="Ignore safety prompt",
340 |     )
341 |     args = parser.parse_args()
342 |     model_id = args.model_id
343 |     api = HfApi()
344 |     if args.y:
345 |         txt = "y"
346 |     else:
347 |         txt = input(
348 |             "This conversion script will unpickle a pickled file, which is inherently unsafe. If you do not trust this file, we invite you to use"
349 |             " https://huggingface.co/spaces/safetensors/convert or google colab or other hosted solution to avoid potential issues with this file."
350 |             " Continue [Y/n] ?"
351 |         )
352 |     if txt.lower() in {"", "y"}:
353 |         try:
354 |             commit_info, errors = convert(api, model_id, force=args.force)
355 |             string = f"""
356 | ### Success 🔥
357 | Yay! This model was successfully converted and a PR was open using your token, here:
358 | [{commit_info.pr_url}]({commit_info.pr_url})
359 |             """
360 |             if errors:
361 |                 string += "\nErrors during conversion:\n"
362 |                 string += "\n".join(
363 |                     f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors
364 |                 )
365 |             print(string)
366 |         except Exception as e:
367 |             print(
368 |                 f"""
369 | ### Error 😢😢😢
370 | 
371 | {e}
372 |             """
373 |             )
374 |     else:
375 |         print(f"Answer was `{txt}` aborting.")


--------------------------------------------------------------------------------
/finetuning/utils/prompter.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A dedicated helper to manage templates and prompt building.
  3 | 
  4 | Code adapted from the alpaca-lora repository at https://github.com/tloen/alpaca-lora/blob/main/utils/prompter.py
  5 | """
  6 | 
  7 | import json
  8 | import os.path as osp
  9 | from typing import Union
 10 | 
 11 | 
 12 | class Prompter(object):
 13 |     
 14 |     __slots__ = ("template", "_verbose", "pctile_test")
 15 | 
 16 |     def __init__(self, template_name: str = "", verbose: bool = False):
 17 |         self._verbose = verbose
 18 |         self.pctile_test = False
 19 |         if template_name == "code_opt_w_speedup_pctile_test":
 20 |             self.pctile_test = True
 21 |             template_name = "code_opt_w_speedup_pctile"
 22 |         if not template_name:
 23 |             # Enforce the default here, so the constructor can be called with '' and will not break.
 24 |             template_name = "code_opt"
 25 |         file_name = osp.join("templates", f"{template_name}.json")
 26 |         if not osp.exists(file_name):
 27 |             raise ValueError(f"Can't read {file_name}")
 28 |         with open(file_name) as fp:
 29 |             self.template = json.load(fp)
 30 |         if self._verbose:
 31 |             print(
 32 |                 f"Using prompt template {template_name}: {self.template['description']}"
 33 |             )
 34 | 
 35 |         print(f"template_name: {template_name}")
 36 |         print(f"pctile_test: {self.pctile_test}")
 37 | 
 38 |     def generate_prompt(
 39 |         self,
 40 |         src_code: str,
 41 |         tgt_code: Union[None, str] = None,
 42 |         speedup_desc: Union[None, str] = None,
 43 |         speedup_bin: Union[None, str] = None,
 44 |         pctile: Union[None, str] = None,
 45 |         code_cutoff: int = 1500,
 46 |     ) -> str:
 47 |         # returns the full prompt from src_code and optional input
 48 |         # if a tgt_code (=response, =output) is provided, it's also appended.
 49 | 
 50 |         # take first 1500 chars of src_code and tgt_code to make sure the prompt is not too long
 51 |         src_code = src_code[:code_cutoff]
 52 | 
 53 |         if speedup_desc and speedup_bin:
 54 |             raise ValueError("speedup_desc and speedup_bin cannot both be set.")
 55 |         
 56 |         if tgt_code:
 57 |             tgt_code = tgt_code[:code_cutoff]
 58 |             
 59 |         if speedup_desc:
 60 |             try:
 61 |                 res = self.template["prompt_no_input"].format(
 62 |                     src_code=src_code,
 63 |                     speedup_desc=speedup_desc
 64 |                 )
 65 |             except Exception as e:
 66 |                 print("Oops! There is no speedup_desc in the template prompt!")
 67 |         elif speedup_bin:
 68 |             try:
 69 |                 res = self.template["prompt_no_input"].format(
 70 |                     src_code=src_code,
 71 |                     speedup_bin=speedup_bin
 72 |                 )
 73 |             except Exception as e:
 74 |                 print("Oops! There is no speedup_bin in the template prompt!")
 75 |         elif pctile: 
 76 |             try: 
 77 |                 res = self.template["prompt_no_input"].format(
 78 |                     src_code=src_code,
 79 |                     pctile=pctile
 80 |                 )
 81 |             except Exception as e:
 82 |                 print("Oops! There is no pctile in the template prompt!")
 83 |         elif self.pctile_test: # test time
 84 |             try:
 85 |                 res = self.template["prompt_no_input"].format(
 86 |                     src_code=src_code,
 87 |                     pctile="10"
 88 |                 )
 89 |             except Exception as e:
 90 |                 print("Oops! There is no pctile in the template prompt!")
 91 |         else: # only src_code
 92 |             try:
 93 |                 res = self.template["prompt_no_input"].format(
 94 |                     src_code=src_code
 95 |                 )
 96 |             except Exception as e:
 97 |                 print("Oops! There is no src_code in the template prompt!")
 98 |             
 99 |         if tgt_code:
100 |             res = f"{res}{tgt_code}"
101 |         
102 |         if self._verbose:
103 |             print(res)
104 |         return res
105 | 
106 |     def get_response(self, output: str) -> str:
107 |         return output.split(self.template["response_split"])[1].strip()
108 | 
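
For reference, a minimal usage sketch of the helper above. It assumes the working directory is `finetuning/` (so the relative `templates/` path resolves), and the toy programs are made up for illustration:

```python
# Sketch: build a training prompt and recover the response portion.
from utils.prompter import Prompter

prompter = Prompter("code_opt")

src_code = "int main() { for (int i = 0; i < 1000000; ++i); return 0; }"  # toy slow program
tgt_code = "int main() { return 0; }"                                     # toy optimized program

# Training-time prompt: source program followed by the target (optimized) program.
full_prompt = prompter.generate_prompt(src_code=src_code, tgt_code=tgt_code)

# Inference-time prompt: source program only; the model completes the optimized version.
infer_prompt = prompter.generate_prompt(src_code=src_code)

# get_response() returns everything after the "### Optimized Version:" marker.
print(prompter.get_response(infer_prompt + tgt_code))
```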


--------------------------------------------------------------------------------
/gem5/README.md:
--------------------------------------------------------------------------------
 1 | # Gem5 Simulator for PIE
 2 | 
 3 | ## Overview
 4 | 
 5 | This subdirectory contains the `gem5` module, which we use to interface with the `gem5` simulator. The `gem5` simulator is a full-system and CPU simulator that can be used to simulate the execution of a program on a computer system. We use `gem5` to simulate the execution of programs in a deterministic and reproducible manner. 
 6 | 
 7 | For our experiments, we use a simulated Intel Skylake CPU.
 8 | We provide an easy-to-use docker image and API that can be used to reproduce our results and for other researchers to continue to use for program optimization research.
 9 | 
10 | Building the environment is similar to the [gym](https://github.com/Farama-Foundation/Gymnasium) API for reinforcement learning. After importing the module and calling `simulator.make(...)`, the Docker image is automatically pulled on first use and a container is created. The returned environment object then provides a convenient abstraction for interacting with the simulator. 
11 | 
12 | Results from our experiments can be found in [this Google Drive folder](https://drive.google.com/drive/folders/1criq4bpLlIaINzhjUAB18NZwDtEkk0Rj?usp=sharing). 
13 | 
14 | gem5
15 | 
16 | ## Usage 
17 | \***********************************************************************************************************************************
18 | 
19 | **Note that in order to use the module and its container for simulation, your architecture will need to be x86-64 (AMD64).** 
20 | 
21 | \***********************************************************************************************************************************
22 | 
23 | First, you need to add the pie project to your Python path. You can do this by running the following command from the root of the pie project:
24 | 
25 | ```bash
26 | export PYTHONPATH=$PYTHONPATH:$(pwd)
27 | ```
28 | 
29 | You will need Docker installed on your system. The module works using the Docker Python SDK and is designed to abstract away the hassle of pulling the container and configuring the gem5 simulator. We have designed it to reflect the OpenAI Gym API, so it should be easy to use for anyone familiar with that.
30 | 
31 | ```python
32 | 
33 | from gem5 import simulator 
34 | env = simulator.make(...) 
35 | results = env.submit_multiple_single_submissions(...)
36 | 
37 | ```
38 | 
39 | To get started, call the simulator.make() function to create an environment object, which you can then use to submit programs to the simulator backend; a short end-to-end sketch follows the argument lists below. 
40 | 
41 | #### Key Arguments for simulator.make()
42 | 
43 | - `arch`: The architecture to use. Currently only 'X86-skylake' is supported.
44 | - `cpuset_cpus`: The cpus to use. If not specified, all cpus are used.
45 | - `workers`: The number of workers to use. If not specified, all cpus are used.
46 | - `gem5_acc_threshold`: If the functional accuracy is below this threshold, we skip any benchmarking and return the result early. 
47 | - `port`: The port to use for communication.
48 | - `optimization_flag`: The GCC optimization flag to use for compilation, for our work we used '-O3'.
49 | - `cpu_type`: The type of CPU configuration to use. For our work we used 'Verbatim' from the Skylake configuration. 
50 | - `timeout_seconds_gem5`: The timeout in seconds for the gem5 simulator, for our work we used 120 seconds for evaluation. 
51 | - `verbose`: We highly recommend setting this to True to monitor the progress of the gem5 simulator.
52 | - `exit_early_on_fail`: If True, we exit early when any individual test case times out or encounters a runtime error. We highly recommend setting this to True to speed things up if you are only evaluating, since a program that fails a test case cannot contribute any speedup. 
53 | 
54 | #### Key Arguments for env.submit_multiple_single_submissions()
55 | 
56 | - `code_list`: A list of strings, each string is the code of a single submission.
57 | - `testcases_list`: A list of sublists; each sublist contains the test cases used for benchmarking the corresponding code, given as integer indices into the test case pool.
58 | - `problem_id_list`: A list of strings, each string is the problem id for the corresponding code.
59 | - `timing_env`: The timing environment to use. Currently only 'gem5' is fully supported; we have prototype support for hardware-based benchmarking on your machine using 'hyperfine' or 'both', but the 'hyperfine' support is not fully implemented yet. 
60 | 
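Putting the two argument lists together, a minimal end-to-end sketch might look like the following. Only the argument names documented above are assumed; the source program, problem id, and test-case indices are illustrative, and the exact structure of the returned results may differ:

```python
# Sketch of the gem5 environment API described above.
# Run from the repo root with PYTHONPATH set as shown earlier.
from gem5 import simulator

env = simulator.make(
    arch="X86-skylake",           # only supported architecture
    optimization_flag="-O3",      # GCC flag used in our experiments
    cpu_type="Verbatim",          # Skylake CPU configuration used in our experiments
    timeout_seconds_gem5=120,     # simulation timeout in seconds; we used 120 for evaluation
    verbose=True,                 # recommended: monitor simulator progress
    exit_early_on_fail=True,      # recommended when only evaluating
)

# One (hypothetical) submission for problem "p00000", benchmarked on test cases 0 and 1.
code = "#include <iostream>\nint main() { std::cout << 42 << std::endl; return 0; }"
results = env.submit_multiple_single_submissions(
    code_list=[code],
    testcases_list=[[0, 1]],      # integer indices into the test case pool
    problem_id_list=["p00000"],   # illustrative problem id
    timing_env="gem5",            # only fully supported timing environment
)
print(results)
```

Arguments such as `cpuset_cpus`, `workers`, `gem5_acc_threshold`, and `port` are omitted above and may need to be set depending on your setup.
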
61 | ## Evaluation Script
62 | 
63 | The evaluation driver is located in `gem5/gem5_eval.py`. This script requires a yaml configuration file to be passed in as an argument to `--config_path`. Example usage from the project directory would be: 
64 | 
65 | ```bash
66 | export PYTHONPATH=$PYTHONPATH:$(pwd)
67 | python gem5/gem5_eval.py --config_path PATH_TO_EXPERIMENT_CONFIG.yaml
68 | ```
69 | 
70 | The yaml configuration file should contain at least the following fields:
71 | 
72 | - `model_generated_outputs_path`: The path to the model generated outputs. This should be a `.jsonl` file containing the model generated outputs in addition to all other metadata in the test set file. 
73 | - `output_dir`: The directory to output the results to.
74 | - `reference_file_path`: The path to the reference file. This should be the reference `.jsonl` file containing the reference outputs in addition to all other metadata in the test set file.
75 | - `model_generated_potentially_faster_code_col`: The column in the model generated outputs that contains the model's generations of potentially faster code. We've used "generated_answers" as a default.
76 | 
77 | An example is provided in [gem5/template_config.yaml](template_config.yaml).
78 | 
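Along the same lines, a small sketch of how one might generate such a config programmatically and launch the driver (the paths are placeholders, and only the four fields listed above are shown; see `template_config.yaml` for the full set of options):

```python
# Sketch: write a minimal evaluation config and run the gem5 evaluation driver.
import subprocess

import yaml  # PyYAML

config = {
    "model_generated_outputs_path": "PATH_TO_OUTPUTS/results.jsonl",
    "output_dir": "PATH_TO_RESULTS_DIR",
    "reference_file_path": "PATH_TO_TEST_FILE/test_file.jsonl",
    "model_generated_potentially_faster_code_col": "generated_answers",
}

with open("my_experiment.yaml", "w") as fh:
    yaml.safe_dump(config, fh)

subprocess.run(
    ["python", "gem5/gem5_eval.py", "--config_path", "my_experiment.yaml"],
    check=True,
)
```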


--------------------------------------------------------------------------------
/gem5/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LearningOpt/pie/eb3d72bc4f3c9095d0f62506340bc8ca0ef27b09/gem5/__init__.py


--------------------------------------------------------------------------------
/gem5/api_pytest.py:
--------------------------------------------------------------------------------
  1 | import benchmarking
  2 | import tempfile 
  3 | import subprocess 
  4 | import os 
  5 | import glob
  6 | import numpy as np
  7 | from tqdm import tqdm
  8 | from collections import defaultdict
  9 | 
 10 | count_to_10_cpp = """
 11 | #include <iostream>
 12 | using namespace std;
 13 | 
 14 | int main() {
 15 |     for (int i = 0; i < 10; i++) {
 16 |         cout << i << endl;
 17 |     }
 18 |     return 0;
 19 | }
 20 | """ 
 21 | 
 22 | mult_in_by_2_cpp = """
 23 | #include <iostream>
 24 | using namespace std;
 25 | 
 26 | int main() {
 27 |     int x;
 28 |     cin >> x;
 29 |     cout << x * 2 << endl;
 30 |     return 0;
 31 | }
 32 | """
 33 | 
 34 | example_1_code = """
 35 | #include <bits/stdc++.h>
 36 | #define REP(i, n) for (int i = 0; i < (n); i++)
 37 | using namespace std;
 38 | const int MOD = 998244353;
 39 | 
 40 | int main() {
 41 | 	cin.tie(0)->sync_with_stdio(false);
 42 | 
 43 | 	int n, k; cin >> n >> k;
 44 | 	vector<int> l(k), r(k);
 45 | 	REP(i, k) cin >> l[i] >> r[i];
 46 | 	REP(i, k) r[i]++;
 47 | 
 48 | 	vector<int> dp(n + 1, 0);
 49 | 	dp[0] = 1;
 50 | 	dp[1] = -1;
 51 | 	REP(i, n) {
 52 | 		if (i > 0)
 53 | 			dp[i] = (dp[i] + dp[i - 1]) % MOD;
 54 | 		REP(j, k) {
 55 | 			if (i + l[j] < n)
 56 | 				dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
 57 | 			if (i + r[j] < n)
 58 | 				dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
 59 | 		}
 60 | 	}
 61 | 	cout << dp[n - 1] << endl;
 62 | 	return 0;
 63 | }
 64 | """
 65 | example_1_problem_id = "p02549"
 66 | 
 67 | example_hello_world_code = """
 68 | #include <iostream>
 69 | 
 70 | int main() {
 71 |     std::cout << "Hello, World!" << std::endl;
 72 |     return 0;
 73 | }
 74 | """
 75 | 
 76 | # def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout):
 77 | #     logging.info(f'executing {bin_path}, with input {in_path}')
 78 | #     with open(in_path, 'r') as fh:
 79 | #         p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
 80 | #     if p.returncode != 0:
 81 | #         raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr.decode('utf-8')}")
 82 | #     else: 
 83 | #         return get_accuracy(p.stdout, ground_truth_output)
 84 | 
 85 | 
 86 | class TestBenchmarking: 
 87 |     def test_compile(self): 
 88 |         with tempfile.TemporaryDirectory() as tmpdir:
 89 |             code_path = os.path.join(tmpdir, "basic.cpp")
 90 |             with open(code_path, "w") as f:
 91 |                 f.write(count_to_10_cpp)
 92 |             output_path = benchmarking.compile_cpp_code(code_path)
 93 |             p = subprocess.run([output_path], capture_output=True, text=True)
 94 |             assert p.returncode == 0
 95 |             assert p.stdout.strip() == "\n".join([str(i) for i in range(10)])
 96 |             assert os.path.exists(output_path)
 97 |             assert os.path.getsize(output_path) > 0
 98 |             
 99 |     def test_exec_bin(self): 
100 |         with tempfile.TemporaryDirectory() as tmpdir:
101 |             code_path = os.path.join(tmpdir, "basic.cpp")
102 |             with open(code_path, "w") as f:
103 |                 f.write(count_to_10_cpp)
104 |             output_path = benchmarking.compile_cpp_code(code_path)
105 |             rc, stdout, stderr = benchmarking.exec_bin(output_path, None, None)
106 |             assert rc == 0
107 |             assert stdout.strip() == "\n".join([str(i) for i in range(10)])
108 |             assert stderr == ""
109 |             
110 |     def test_exec_bin_input(self):
111 |         with tempfile.TemporaryDirectory() as tmpdir:
112 |             code_path = os.path.join(tmpdir, "basic.cpp")
113 |             input_path = os.path.join(tmpdir, "input.txt")
114 |             with open(code_path, "w") as f:
115 |                 f.write(mult_in_by_2_cpp)
116 |             with open(input_path, "w") as f:
117 |                 f.write("2")
118 |             output_path = benchmarking.compile_cpp_code(code_path)
119 |             rc, stdout, stderr = benchmarking.exec_bin(output_path, input_path, None)
120 |             assert rc == 0
121 |             assert stdout.strip() == "4"
122 |             assert stderr == ""
123 |             
124 |     def test_exec_bin_for_acc(self):    
125 |          with tempfile.TemporaryDirectory() as tmpdir:
126 |             code_path = os.path.join(tmpdir, "basic.cpp")
127 |             input_path = os.path.join(tmpdir, "input.txt")
128 |             with open(code_path, "w") as f:
129 |                 f.write(mult_in_by_2_cpp)
130 |             with open(input_path, "w") as f:
131 |                 f.write("2")
132 |             output_path = benchmarking.compile_cpp_code(code_path)
133 |             acc_correct = benchmarking.exec_bin_for_acc(output_path, input_path, "4", None)
134 |             acc_incorrect = benchmarking.exec_bin_for_acc(output_path, input_path, "5", None)
135 |             assert acc_correct == 1
136 |             assert acc_incorrect == 0
137 |             
138 |     def test_compile_and_check_outputs(self): 
139 |         with tempfile.TemporaryDirectory() as tempdir: 
140 |             code_path = os.path.join(tempdir, "basic.cpp")
141 |             with open(code_path, "w") as fh: 
142 |                 fh.write(example_1_code)
143 |             bin_path, accs = benchmarking.compile_and_check_outputs(
144 |                 code_path=code_path, 
145 |                 problem_id=example_1_problem_id, 
146 |                 testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/"
147 |             )
148 |             print(f"bin_path: {bin_path}")
149 |             assert os.path.exists(bin_path)
150 |             assert os.path.getsize(bin_path) > 0
151 |         assert np.mean(list(accs.values())) == 1.0
152 |         assert np.std(list(accs.values())) == 0.0
153 |         n_testcases = len(glob.glob(os.path.join("/home/pie-perf/data/codenet/merged_test_cases/", example_1_problem_id, "input.*.txt")))
154 |         assert len(accs) == n_testcases
155 |         
156 |     def test_exec_gem5(self):
157 |         sim_seconds = []
158 |         sim_seconds_precise = []
159 |         for _ in tqdm(range(5)):
160 |             with tempfile.TemporaryDirectory() as tmpdir:
161 |                 code_path = os.path.join(tmpdir, "basic.cpp")
162 |                 with open(code_path, "w") as f:
163 |                     f.write(example_hello_world_code)
164 |                 output_path = benchmarking.compile_cpp_code(code_path, cflags="--std=c++17 -O3")
165 |                 rc, stdout, stderr = benchmarking.exec_gem5(
166 |                     gem5_dir="/home/gem5/build/X86/", 
167 |                     gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py", 
168 |                     cpu_type="Verbatim",
169 |                     bin_path=output_path,
170 |                     in_path=None,
171 |                     stats_out_path=os.path.join(tmpdir, "stats.txt"),
172 |                     timeout=60, 
173 |                     cpu_number=0)
174 |                 
175 |                 assert rc == 0
176 |                 stats = benchmarking.parse_stats_txt(os.path.join(tmpdir, "stats.txt"))
177 |                 sim_seconds.append(stats["sim_seconds"])
178 |                 sim_seconds_precise.append(stats["sim_seconds_precise"])
179 |         print(f"sim_seconds: {sim_seconds}")
180 |         print(f"sim_seconds_precise: {sim_seconds_precise}")
181 |         assert np.isclose(np.mean(sim_seconds), 0.001004, atol=1e-5)
182 |         assert np.isclose(np.mean(sim_seconds_precise), 0.001004, atol=1e-5)
183 |         assert all(sim_seconds_precise[i] == 0.001004121118 for i in range(len(sim_seconds_precise)))
184 | 
185 |     def test_run_gem5(self): 
186 |         sim_seconds_0 = []
187 |         sim_seconds_1 = []
188 |         for _ in tqdm(range(2)): 
189 |             with tempfile.TemporaryDirectory() as tmpdir:
190 |                 code_path = os.path.join(tmpdir, "code.cpp")
191 |                 with open(code_path, "w") as f:
192 |                     f.write(example_1_code)
193 |                 bin_path = benchmarking.compile_cpp_code(code_path)
194 |                 tc_2_results = benchmarking.run_gem5(
195 |                     gem5_dir="/home/gem5/build/X86/", 
196 |                     gem5_script_path="/home/gem5-skylake-config/gem5-configs/run-se.py", 
197 |                     cpu_type="Verbatim",
198 |                     bin_path=bin_path, 
199 |                     problem_id=example_1_problem_id, 
200 |                     testcases_dir="/home/pie-perf/data/codenet/merged_test_cases/", 
201 |                     testcases=[0,1], 
202 |                     timeout=30, 
203 |                     cpu_number=0
204 |                 )
205 |                 assert tc_2_results[0]["success"] == True 
206 |                 assert tc_2_results[1]["success"] == True 
207 |                 assert len(tc_2_results) == 2 
208 |                 sim_seconds_0.append(tc_2_results[0]["stats"]["sim_seconds_precise"])
209 |                 sim_seconds_1.append(tc_2_results[1]["stats"]["sim_seconds_precise"])
210 |         print(f"sim_seconds for tc 0 {sim_seconds_0}")
211 |         print(f"sim_seconds for tc 1 {sim_seconds_1}")
212 |         assert sim_seconds_0[0] == sim_seconds_0[1] == 0.001035073468
213 |         assert sim_seconds_1[0] == sim_seconds_1[1] == 0.001039205596
214 |     
215 | 
216 |     def test_run_hyperfine(self):
217 |         tc2times = defaultdict(list)
218 |         for _ in range(2):
219 |             with tempfile.TemporaryDirectory() as tmpdir:
220 |                     code_path = os.path.join(tmpdir, "code.cpp")
221 |                     with open(code_path, "w") as f:
222 |                         f.write(example_1_code)
223 |                     code2results, output = benchmarking.run_hyperfine(
224 |                         code_paths=[code_path],
225 |                         problem_ids=[example_1_problem_id],
226 |                         path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
227 |                         json_out_path=os.path.join(tmpdir, "results.json"),
228 |                         test_cases_list=[[i for i in range(10)]], 
229 |                         min_runs_per_test_case=10, 
230 |                         max_runs_per_test_case=500, 
231 |                         strict_runs_per_test_case=False,
232 |                         warmup_runs_per_test_case=5,
233 |                         cpu_number=0,
234 |                         do_sanity_check=True)
235 |                     for tc, results in code2results[code_path].items():
236 |                         tc2times[tc].append(np.array(results["times"]))
237 |         for tc, times in tc2times.items():
238 |             mean_times = []
239 |             for time_list in times:
240 |                 mean_times.append(np.mean(time_list))
241 |             assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
242 |             print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
243 |         assert len(tc2times) == 10
244 |         
245 |     def test_run_hyperfine_strict(self):
246 |         tc2times = defaultdict(list)
247 |         for _ in range(2):
248 |             with tempfile.TemporaryDirectory() as tmpdir:
249 |                     code_path = os.path.join(tmpdir, "code.cpp")
250 |                     with open(code_path, "w") as f:
251 |                         f.write(example_1_code)
252 |                     code2results, output = benchmarking.run_hyperfine(
253 |                         code_paths=[code_path],
254 |                         problem_ids=[example_1_problem_id],
255 |                         path_to_testcases="/home/pie-perf/data/codenet/merged_test_cases/",
256 |                         json_out_path=os.path.join(tmpdir, "results.json"),
257 |                         test_cases_list=None, 
258 |                         min_runs_per_test_case=100, 
259 |                         max_runs_per_test_case=None, 
260 |                         strict_runs_per_test_case=True,
261 |                         warmup_runs_per_test_case=5,
262 |                         cpu_number=0,
263 |                         do_sanity_check=True)
264 |                     for tc, results in code2results[code_path].items():
265 |                         tc2times[tc].append(np.array(results["times"]))
266 |         for tc, times in tc2times.items():
267 |             assert len(times) == 2
268 |             mean_times = []
269 |             for time_list in times:
270 |                 assert len(time_list) == 100
271 |                 mean_times.append(np.mean(time_list))
272 |             assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
273 |             print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
274 |         assert len(tc2times) == len(glob.glob(f"/home/pie-perf/data/codenet/merged_test_cases/{example_1_problem_id}/input*"))
275 |             
276 |             
277 |                 
278 | 
279 |             
280 |             
281 |     
282 |     
283 |         
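
Both tests above gate on run-to-run stability rather than on absolute runtime: the per-test-case mean wall-times from two independent hyperfine invocations must agree to within a 5% coefficient of variation. A minimal sketch of that criterion in isolation, with hypothetical mean_times values standing in for the measurements collected above:

import numpy as np

# Hypothetical per-invocation mean wall-times (seconds) for a single test case,
# one entry per independent hyperfine run.
mean_times = [0.0412, 0.0405]

# Coefficient of variation across invocations; the tests above require < 5%.
cv = np.std(mean_times) / np.mean(mean_times)
assert cv < 0.05, f"unstable timings: cv={cv:.4f}"
print(f"cv = {cv:.4f}")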


--------------------------------------------------------------------------------
/gem5/benchmarking.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import pandas as pd
  3 | import shutil
  4 | import os
  5 | import warnings
  6 | import traceback
  7 | import logging
  8 | import subprocess
  9 | import glob
 10 | import re
 11 | import traceback
 12 | import time
 13 | import shlex
 14 | from typing import Optional, List, Tuple, Dict, Any, Union
 15 | import multiprocessing
 16 | from collections import defaultdict
 17 | import json 
 18 | import resource
 19 | import re
 20 | import ast
 21 | from dataclasses import dataclass
 22 | 
 23 | logging.basicConfig(level=logging.DEBUG)
 24 | logging.getLogger("resource").setLevel(logging.DEBUG)
 25 | 
 26 | MAX_VIRTUAL_MEMORY = 10 * 1024 * 1024 * 50  # 500 MB
 27 | 
 28 | # from https://gist.github.com/s3rvac/f97d6cbdfdb15c0a32e7e941f7f4a3fa
 29 | def limit_virtual_memory():
 30 |     resource.setrlimit(resource.RLIMIT_AS, (MAX_VIRTUAL_MEMORY, MAX_VIRTUAL_MEMORY * 10))
 31 |     
 32 |     
 33 | def get_accuracy(output: str, ground_truth: str) -> float:
 34 |     """
 35 |     Compare the program output with the ground truth line by line and return the fraction of matching lines (numeric lines match within 1e-3).
 36 |     """
 37 |     num_correct = 0
 38 |     ground_truth_lines = ground_truth.strip().splitlines()
 39 |     output_truth_lines = output.strip().splitlines()
 40 |     for gen_output, ground_truth_output in zip(output_truth_lines, ground_truth_lines):
 41 |         is_corr = gen_output == ground_truth_output
 42 |         if not is_corr:
 43 |             try:
 44 |                 gen_output = float(gen_output)
 45 |                 ground_truth_output = float(ground_truth_output)
 46 |                 is_corr = abs(gen_output - ground_truth_output) < 1e-3
 47 |             except:
 48 |                 pass
 49 |         num_correct += int(is_corr)
 50 | 
 51 |     return num_correct / len(ground_truth_lines)
 52 | 
 53 | def compile_cpp_code(code_path: str, timeout: int = 30, output_path: str = None, cflags: str = "--std=c++17 -O3", cpu_number: Optional[int] = None) -> str:
 54 |     """Compile a C++ source file with g++ and return the path to the compiled binary.
 55 | 
 56 |     Args:
 57 |         code_path (str): path to the C++ source file to compile.
 58 |         output_path (str, optional): where to write the binary; defaults to the source path with a .out extension.
 59 |         cflags (str, optional): flags passed to g++; cpu_number, if given, pins compilation to that CPU via taskset.
 60 | 
 61 |     Returns:
 62 |         str: path to the compiled binary.
 63 |     """
 64 |     if output_path is None:
 65 |         output_path = os.path.join(os.path.dirname(code_path), f"{os.path.splitext(os.path.basename(code_path))[0]}.out")
 66 |     cpu_cmd = f"taskset --cpu-list {cpu_number}" if cpu_number is not None else ""
 67 |         
 68 |     cmd = shlex.split(cpu_cmd) + ["/usr/bin/g++", code_path, "-o", output_path] + shlex.split(cflags.replace('"', "").replace("'", ""))
 69 |     logging.critical(f"Running command: {' '.join(cmd)}")
 70 |     p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
 71 |     if p.returncode != 0:
 72 |         raise Exception(f"Error compiling code: {code_path} with command: {' '.join(cmd)}, return code: {p.returncode}, stderr: {p.stderr}")
 73 |     else: 
 74 |         # sometimes there can be latency in the file system, so we wait a bit
 75 |         while(not os.path.exists(output_path)):
 76 |             time.sleep(0.05)
 77 |     return output_path
 78 | 
 79 | def exec_bin(bin_path, in_path, timeout, cpu_number=None):
 80 |     logging.info(f'executing {bin_path}, with input {in_path}')
 81 |     if in_path is not None:
 82 |         fh = open(in_path, 'r')
 83 |     else: 
 84 |         fh = subprocess.DEVNULL
 85 |     cmd = [bin_path]
 86 |     if cpu_number is not None:
 87 |         cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
 88 |     p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
 89 |     if in_path is not None:
 90 |         fh.close()
 91 |     return p.returncode, p.stdout, p.stderr
 92 | 
 93 | def exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout: str = None, cpu_number=None):
 94 |     gem5_bin = os.path.join(gem5_dir, 'gem5.opt')
 95 |     cmd = shlex.split(f"{gem5_bin} --stats-file={stats_out_path} {gem5_script_path} {cpu_type} {bin_path}")
 96 |     if cpu_number is not None:
 97 |         cmd = ["taskset", "--cpu-list", str(cpu_number)] + cmd
 98 |     if in_path is not None:
 99 |         logging.info(f'executing {" ".join(cmd)}, with input {in_path}')
100 |         with open(in_path, 'r') as fh:
101 |             p = subprocess.run(cmd, capture_output=True, timeout=timeout, stdin=fh, text=True)
102 |     else: 
103 |         logging.info(f'executing {" ".join(cmd)}, with no input')
104 |         p = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
105 |     return p.returncode, p.stdout, p.stderr
106 |     
107 | def exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout=None):
108 |     logging.info(f'executing {bin_path}, with input {in_path}')
109 |     with open(in_path, 'r') as fh:
110 |         p = subprocess.run([bin_path], capture_output=True, timeout=timeout, stdin=fh, text=True)
111 |     if p.returncode != 0:
112 |         raise Exception(f"Error executing code: {bin_path}, return code: {p.returncode}, stderr: {p.stderr}")
113 |     else: 
114 |         return get_accuracy(p.stdout, ground_truth_output)
115 |     
116 | def compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout=None, cflags: str ="--std=c++17 -O3", testcases: List[int] = None, cpu_number=None):
117 |     
118 |     input_output_pairs = {}
119 |     input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
120 |     for in_path in input_paths:
121 |         tc_no = re.search(r"input\.(\d+)\.txt", in_path).group(1)
122 |         if testcases is not None and int(tc_no) not in testcases and tc_no not in testcases: # allow both int and str
123 |             continue
124 |         out_path = os.path.join(testcases_dir, problem_id, f"output.{tc_no}.txt")
125 |         input_output_pairs[tc_no] = (in_path, out_path)
126 |     logging.info(f"Found {len(input_output_pairs)} testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
127 |     try: 
128 |         bin_path = compile_cpp_code(code_path, timeout, cflags=cflags, cpu_number=cpu_number)
129 |         logging.info(f"Compiled {code_path} to {bin_path}")
130 |     except Exception as e:
131 |         return None, {tc_no: 0 for tc_no in input_output_pairs.keys()}
132 |     
133 |     accs = {}    
134 |     
135 |     for tc_no, (in_path, out_path) in input_output_pairs.items():
136 |         with open(out_path, 'r') as fh:
137 |             ground_truth_output = fh.read().strip()
138 |         try:
139 |             acc = exec_bin_for_acc(bin_path, in_path, ground_truth_output, timeout)
140 |             accs[tc_no] = acc
141 |         except Exception as e:
142 |             logging.error(f"Error executing code: {bin_path} with input: {in_path}, error: {e}")
143 |             accs[tc_no] = 0
144 |             
145 |     logging.info(f"bin_path: {bin_path}, accs: {accs}")
146 |             
147 |     return bin_path, accs
148 | 
149 | def compile_and_check_outputs_multi(
150 |     code_paths, 
151 |     problem_ids, 
152 |     testcases_dir,
153 |     timeout=None,
154 |     cflags: str ="--std=c++17 -O3",
155 |     test_cases_list = None,
156 |     cpu_number=None): 
157 |     if test_cases_list is None:
158 |         test_cases_list = [None for _ in range(len(code_paths))]
159 |     code2results = defaultdict(dict)
160 |     for code_path, problem_id, test_cases in zip(code_paths, problem_ids, test_cases_list):
161 |         bin_path, accs = compile_and_check_outputs(code_path, problem_id, testcases_dir, timeout, cflags, test_cases, cpu_number)
162 |         code2results[code_path]["compile_success"] = bin_path is not None
163 |         code2results[code_path]["bin_path"] = bin_path
164 |         code2results[code_path]["accs"] = accs
165 |     return code2results
166 | 
167 | 
168 | def calc_sim_seconds(stats):
169 |     return float(stats["sim_ticks"]) / float(stats["sim_freq"]) # more accurate than sim_seconds
170 | 
171 | 
172 | def parse_stats_txt(stats_path):
173 |     with open(stats_path, 'r') as f:
174 |         stats_lines = f.readlines()
175 |     
176 |     stats = {}
177 |     for line in stats_lines:
178 |         if line.strip() == '':
179 |             continue
180 |         if "Begin" in line:
181 |             continue
182 |         if "End" in line:
183 |             continue
184 |         line = re.sub("#.*", "", line).strip() # remove comments
185 |         parts = line.split()
186 |         parts = [part.strip() for part in parts]
187 |         if len(parts) > 2: 
188 |             value = parts[1:]
189 |         elif len(parts) == 2:
190 |             value = parts[1]
191 |         else: 
192 |             logging.warning(f'could not parse line {line}')
193 |             continue
194 |         key = parts[0]
195 |         if isinstance(value, str): 
196 |             try: 
197 |                 value = value.replace("%", "").replace("nan", "None").replace("inf", "None").replace("-inf", "None")
198 |                 value = ast.literal_eval(value) if value != "None" else None
199 |             except:
200 |                 logging.warning(f"could not parse value {value} for key {key}")
201 |         elif isinstance(value, list):
202 |             try: 
203 |                 value = [v.replace("%", "").replace("nan", "None").replace("inf", "None").replace("-inf", "None") for v in value]
204 |                 value = [ast.literal_eval(v) if v != "None" else None for v in value]
205 |             except:
206 |                 logging.warning(f"could not parse value {value} for key {key}")
207 |         stats[key] = value
208 |     stats["sim_seconds_precise"] = calc_sim_seconds(stats)
209 |     return stats
210 |      
211 | 
212 | def run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, testcases: List[int] = None, cpu_number=None, exit_early_on_fail=True):
213 |     input_paths = glob.glob(os.path.join(testcases_dir, problem_id, f"input.*.txt"))
214 |     tc_2_in_path = {}
215 |     logging.info(f"Found {len(input_paths)} total testcases for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
216 |     for in_path in input_paths:
217 |         tc_no = int(re.search(r"input\.(\d+)\.txt", in_path).group(1))
218 |         if testcases is not None and str(tc_no) not in testcases and tc_no not in testcases:
219 |             continue
220 |         tc_2_in_path[tc_no] = in_path
221 |     logging.info(f"Found {len(tc_2_in_path)} testcases to actually run for problem: {problem_id} in testcases_dir: {testcases_dir} with testcases: {testcases}")
222 |     tc_2_results = {}
223 |     any_incorrect_or_timeout = False
224 |     logging.critical(f"Running {bin_path} on testcases: {tc_2_in_path.keys()}")
225 |     for tc_no, in_path in tc_2_in_path.items():
226 |         # logging.critical(f"Running {bin_path} on testcase {tc_no} with input {in_path}")
227 |         #### TODO: MAKE SURE ALL CODE/BINARIES ARE IN UNIQUE DIRECTORIES
228 |         stats_out_path = os.path.splitext(bin_path)[0] + f".{tc_no}.txt"
229 |         if exit_early_on_fail and any_incorrect_or_timeout:
230 |             tc_2_results[tc_no] = {"success": False, "error": "Previous testcase was incorrect or timed out, so skipping this testcase",
231 |                                    "stats": None, "stdout": None, "stderr": None, "time": None} 
232 |         else: 
233 |             try: 
234 |                 returncode, stdout, stderr = exec_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, in_path, stats_out_path, timeout, cpu_number=cpu_number)
235 |                 if returncode != 0:
236 |                     tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, return code: {returncode}, stderr: {stderr}", 
237 |                                         "stats": None, "stdout": stdout, "stderr": stderr, "time": None}
238 |                     any_incorrect_or_timeout = True
239 |                 else: 
240 |                     tc_2_results[tc_no] = {"success": True, "error": None, "stats": parse_stats_txt(stats_out_path), "stdout": stdout, "stderr": stderr, "time": parse_stats_txt(stats_out_path)["sim_seconds_precise"]}
241 |             except Exception as e:
242 |                 traceback_err = traceback.format_exc()
243 |                 tc_2_results[tc_no] = {"success": False, "error": f"Error executing code: {bin_path}, error: {e}, traceback: {traceback_err}", 
244 |                                         "stats": None, "stdout": None, "stderr": None, "time": None}
245 |                 any_incorrect_or_timeout = True
246 |     return tc_2_results     
247 | 
248 | 
249 | def run_gem5_multi(gem5_dir, gem5_script_path, cpu_type, bin_paths, problem_ids, testcases_dir, timeout, test_cases_list: List[int] = None, cpu_number=None, exit_early_on_fail=True):
250 |     if test_cases_list is None:
251 |         test_cases_list = [None for _ in range(len(bin_paths))]
252 |     bin2results = defaultdict(dict)
253 |     for bin_path, problem_id, test_cases in zip(bin_paths, problem_ids, test_cases_list):
254 |         bin2results[bin_path] = run_gem5(gem5_dir, gem5_script_path, cpu_type, bin_path, problem_id, testcases_dir, timeout, test_cases, cpu_number, exit_early_on_fail)
255 |     return bin2results
256 | 
257 | #### hyperfine
258 | 
259 | FSTREAM_HEADER="#include <fstream>" # for redirecting io
260 | 
261 | CPP_HEADERS=[FSTREAM_HEADER]
262 | 
263 | def make_redirect_io_cpp(testcase_path, output_path=None): 
264 |     lines = f"\nstd::ifstream cin(\"{testcase_path}\");\n"
265 |     if output_path: 
266 |         lines = lines + f"std::ofstream cout(\"{output_path}\");\n\n"
267 |     return lines
268 | 
269 | def add_headers_cpp(code_str): 
270 |     for header in CPP_HEADERS:
271 |         if header not in code_str:
272 |             code_str = header + "\n" + code_str    
273 |     return code_str
274 | 
275 | 
276 | def insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs=None): 
277 |     import re
278 |     ## match all whitespace after main and include that in the match greedy
279 |     m = re.search(r"main(\s*)[^{}]*\{", code_str)
280 |     if m is None:
281 |         raise ValueError("No main function found")
282 |     insert_idx = m.end()
283 |     io_redirects = make_redirect_io_cpp(path_to_testcases, path_to_outputs)
284 |     return code_str[:insert_idx] + io_redirects + code_str[insert_idx:]
285 | 
286 | 
287 | def redirect_cpp_io(code_str, path_to_testcases, path_to_outputs=None): 
288 |     code_str = add_headers_cpp(code_str)
289 |     code_str = insert_io_redirects_cpp(code_str, path_to_testcases, path_to_outputs)
290 |     return code_str
291 | 
292 | 
293 | def redirect_cpp_io_file(code_path, stdin_path, stdout_path=None, new_code_dir=None): 
294 |     input_basename = os.path.splitext(os.path.basename(stdin_path))[0].replace(".", "_")
295 |     if new_code_dir is None:
296 |         new_code_dir = os.path.dirname(code_path)
297 |     if stdout_path is None:
298 |         basename = os.path.splitext(os.path.basename(code_path))[0]
299 |         stdout_path = os.path.join(new_code_dir, f"{basename}_{input_basename}.stdout")        
300 |     with open(code_path, "r") as f:
301 |         code_str = f.read()
302 |     code_str = redirect_cpp_io(code_str, stdin_path, stdout_path)
303 |     new_code_path = os.path.join(new_code_dir, f"redirected_{input_basename}_{os.path.basename(code_path)}")
304 |     with open(new_code_path, "w") as f:
305 |         f.write(code_str)
306 |     return new_code_path, stdout_path
307 | 
308 | 
309 | def redirect_cpp_io_and_compile(code_path, stdin_path, cpu_number=None, new_code_dir=None, stdout_path=None, cflags="--std=c++17 -O3"): 
310 |     new_code_path, stdout_path = redirect_cpp_io_file(code_path, stdin_path, stdout_path=stdout_path, new_code_dir=new_code_dir)
311 |     new_binary_path = compile_cpp_code(new_code_path, cpu_number=cpu_number, cflags=cflags)
312 |     return new_binary_path, new_code_path, stdout_path
313 | 
314 |     
315 | ## physical / logical cpu management
316 | 
317 | def get_physical_cpu_list():
318 |     cmd = " grep -E '^processor|^physical id|^core id' /proc/cpuinfo "
319 |     output = os.popen(cmd).read()
320 |     output = output.split("processor")
321 |     output = [x for x in output if x]
322 |     physical2logical = defaultdict(list)
323 |     n_logical = 0
324 |     for cpu_info in output:
325 |         logical_id = re.search(r"(?<=\t: )\d+", cpu_info).group(0)
326 |         physical_id = re.search(r"(?<=core id\t\t: )\d+", cpu_info).group(0)
327 |         physical2logical[int(physical_id)].append(int(logical_id))
328 |         n_logical += 1
329 |     n_physical = len(physical2logical)
330 |     from pprint import pformat
331 |     logging.info(f"Physical CPU (n={n_physical}) to Logical CPU (n={n_logical}) mapping:")
332 |     logging.info(pformat(sorted(dict(physical2logical).items(), key=lambda x: int(x[0]))))
333 |     unique_logical_ids = []
334 |     for physical_id, logical_ids in physical2logical.items():
335 |         unique_logical_ids.append(logical_ids[0])
336 |     logging.info(f"The set of logical ids available for use (n={len(unique_logical_ids)}):")
337 |     logging.info(unique_logical_ids)
338 |     return unique_logical_ids
339 | 
340 | def add_logicial_cpus_to_queue(num_processes, queue):
341 |     highest_num_processes = multiprocessing.cpu_count() 
342 |     if num_processes < 0: 
343 |         num_processes = highest_num_processes
344 |     else: 
345 |         if num_processes > highest_num_processes:
346 |             raise ValueError(f"num_processes {num_processes} is greater than the highest available cpu: {highest_num_processes}.")
347 |     available_cpus = list(range(num_processes))
348 |     if len(available_cpus) > 2: 
349 |         available_cpus = available_cpus[:-2]
350 |     else: 
351 |         logging.warning(f"there are fewer than 3 logical CPUs which is not recommended")
352 |     for cpu_id in available_cpus:
353 |         queue.put(cpu_id)
354 |     logging.info(f"List of cpus to be used: {available_cpus}")
355 |     return available_cpus
356 | 
357 | def add_physical_cpus_to_queue(num_processes, queue):
358 |     available_cpus = [i for i in get_physical_cpu_list() if i >= 0]
359 |     if len(available_cpus) > 2: 
360 |         available_cpus = available_cpus[:-2]
361 |     else: 
362 |         logging.warning(f"there are fewer than 3 physical CPUs which is not recommended")
363 |     if num_processes < 0: 
364 |         num_processes = len(available_cpus)
365 |     elif len(available_cpus) < num_processes:
366 |         raise ValueError(f"Only {len(available_cpus)} available cpus, but {num_processes} processes requested; the set of available cpus is {available_cpus}")
367 |     for cpu_id in available_cpus[:num_processes]:
368 |         queue.put(cpu_id)
369 |     logging.info(f"List of cpus to be used: {available_cpus[:num_processes]}")
370 |     return available_cpus
371 | 
372 | def run_benchmark(args, json_output_path, timeout_seconds: int = 60) -> Tuple[Optional[List[Dict[str, Any]]], Any]:
373 |     try: 
374 |         logging.info(f"Running {' '.join(args)}")
375 |         proc = subprocess.Popen(
376 |             args, 
377 |             preexec_fn=limit_virtual_memory,
378 |             # stderr=subprocess.DEVNULL, 
379 |             # stdout=subprocess.DEVNULL
380 |         )
381 |         output = proc.communicate(timeout=timeout_seconds)[0]
382 |         if os.path.exists(json_output_path):
383 |             results = json.load(open(json_output_path)).get("results", [])
384 |             return results, output
385 |         else:
386 |             return None, output
387 |     except subprocess.TimeoutExpired: 
388 |         logging.warning(f"Timeout for {args}")
389 |         proc.kill()  # type: ignore
390 |         return None, f"Timeout after {timeout_seconds} seconds"
391 |     except json.decoder.JSONDecodeError: 
392 |         logging.warning(f"JSONDecodeError for {args}")
393 |         return None, f"JSONDecodeError"
394 |     except KeyboardInterrupt as e:
395 |         proc.kill()  # type: ignore
396 |         raise e
397 | 
398 |     
399 | def run_hyperfine(code_paths: List[str], 
400 |                    problem_ids: List[str], 
401 |                    path_to_testcases: str,
402 |                    json_out_path: str, # TODO REMOVE json_out_path
403 |                    test_cases_list: List[int] = None,
404 |                    min_runs_per_test_case: int = None, 
405 |                    max_runs_per_test_case: int = None,
406 |                    strict_runs_per_test_case: bool = False,
407 |                    warmup_runs_per_test_case: int = 5,
408 |                    cpu_number: int = None, 
409 |                    do_sanity_check: bool = False, 
410 |                    cflags: str = "--std=c++17 -O3"):
411 |     """
412 |     Benchmark every code/testcase pair in a single hyperfine invocation (one JSON report), all pinned to the same CPU.
413 |     """
414 |     
415 |     ### TODO: need to change to handle compilation errors and timeouts
416 |     
417 |     code2benchmarks = defaultdict(list)
418 |     benchmark2code = {}
419 |     code2results = defaultdict(dict)
420 |     code2testcases = defaultdict(list)
421 |     if test_cases_list is None: 
422 |         test_cases_list = [None] * len(code_paths)
423 |     for code_path, problem_id, test_case_list in zip(code_paths, problem_ids, test_cases_list):
424 |         problem_dir = os.path.join(path_to_testcases, problem_id)
425 |         testcases_paths = glob.glob(os.path.join(problem_dir, "input.*.txt"))
426 |         if test_case_list is not None:
427 |             testcases_paths = [t for t in testcases_paths if int(re.search(r"(?<=input\.)\d+", t).group(0)) in test_case_list]
428 |         test_case_numbers = [int(re.search(r"(?<=input\.)\d+", t).group(0)) for t in testcases_paths]
429 |         code2testcases[code_path] = test_case_numbers
430 |         for testcase_path in testcases_paths:
431 |             bin_redirect, code_redirect, _ = redirect_cpp_io_and_compile(code_path, 
432 |                                                                          testcase_path, 
433 |                                                                          cpu_number=cpu_number, 
434 |                                                                          cflags=cflags)
435 |             code2benchmarks[code_path].append(bin_redirect)
436 |             benchmark2code[bin_redirect] = code_path
437 |     
438 |     cmds = " ".join([bin_redirect for bin_redirects in code2benchmarks.values() for bin_redirect in bin_redirects])
439 |     n_cmds = len(cmds.split(" "))
440 |     if strict_runs_per_test_case:
441 |         assert min_runs_per_test_case is not None 
442 |         runs_str = f" --runs {min_runs_per_test_case}"
443 |     else: 
444 |         runs_str = ""
445 |         if min_runs_per_test_case is not None: 
446 |             runs_str += f" --min-runs {min_runs_per_test_case}"
447 |         if max_runs_per_test_case is not None:
448 |             runs_str += f" --max-runs {max_runs_per_test_case}"
449 |     if warmup_runs_per_test_case is not None:
450 |         runs_str += f" --warmup {warmup_runs_per_test_case}"
451 |     
452 |     cmd_benchmark = (
453 |         f"hyperfine {runs_str} -N {cmds}  --export-json {json_out_path} "
454 |     )
455 |     
456 |     if cpu_number is not None:
457 |         cmd_benchmark = f"taskset --cpu-list {cpu_number} {cmd_benchmark}"
458 |         
459 |     if do_sanity_check: 
460 |         SANITY_CHECK_TIMEOUT = 1.5 * n_cmds
461 |         cmd_sanity_check = cmd_benchmark.replace(runs_str, f" --runs 2 --warmup 1 ") 
462 |         p = subprocess.run(shlex.split(cmd_sanity_check), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=SANITY_CHECK_TIMEOUT, encoding="utf-8")
463 |         if p.returncode != 0:
464 |             return None, f"Sanity check failed for {cmd_sanity_check}: {p.stderr}"
465 |     results, output = run_benchmark(shlex.split(cmd_benchmark), json_out_path)
466 |     if results is None: return None, output  # hyperfine timed out or produced no parseable JSON
467 |     for result in results: 
468 |         command = result["command"]
469 |         tc_no = int(re.search(r"(?<=input_)\d+", command).group(0))
470 |         code2results[benchmark2code[command]][tc_no] = result
471 |     for bin, code in benchmark2code.items():
472 |         results = code2results[code]
473 |         missing_tcs = set(code2testcases[code]) - set(results.keys())
474 |         for tc_no in missing_tcs:
475 |             results[tc_no] = None
476 |     return code2results, output
477 |         
478 |         
479 |         
480 |     
481 | 
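
As a pointer for readers of run_hyperfine above: the I/O-redirection step rewrites each submission so hyperfine can time it with -N (no shell) and no stdin piping. redirect_cpp_io prepends the <fstream> header if it is missing and declares file-backed cin/cout streams at the top of main, shadowing the std:: streams. A minimal sketch of that transformation, assuming the repository root is on PYTHONPATH and using hypothetical /tmp paths:

from gem5 import benchmarking  # or `import benchmarking` when run from within gem5/

src = """#include <iostream>
using namespace std;
int main() {
    int x; cin >> x;
    cout << 2 * x << endl;
}
"""

# Inserts `std::ifstream cin("...");` / `std::ofstream cout("...");` right after
# the opening brace of main, so unqualified cin/cout resolve to the file streams.
redirected = benchmarking.redirect_cpp_io(src, "/tmp/input.0.txt", "/tmp/output.0.txt")
print(redirected)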


--------------------------------------------------------------------------------
/gem5/gem5_api.py:
--------------------------------------------------------------------------------
  1 | from flask import Flask, request, jsonify
  2 | import argparse
  3 | import json
  4 | import logging
  5 | from datetime import datetime
  6 | import os
  7 | from joblib import Parallel, delayed
  8 | import benchmarking
  9 | import tempfile
 10 | import multiprocessing
 11 | import numpy as np
 12 | import joblib
 13 | from tqdm import tqdm
 14 | import contextlib
 15 | 
 16 | LOGGING_DIR="/home/logs/"
 17 | if not os.path.exists(LOGGING_DIR): 
 18 |     os.makedirs(LOGGING_DIR)
 19 | 
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | logger.setLevel(logging.CRITICAL)
 23 | 
 24 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
 25 | 
 26 | # Create a file handler for the log file
 27 | start_date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
 28 | file_handler = logging.FileHandler(os.path.join(LOGGING_DIR, start_date_time + "_gem5_api.log"))
 29 | file_handler.setLevel(logging.DEBUG)
 30 | file_handler.setFormatter(formatter)
 31 | 
 32 | # Create a stream handler to print the logs to stdout
 33 | stream_handler = logging.StreamHandler()
 34 | stream_handler.setLevel(logging.INFO)
 35 | stream_handler.setFormatter(formatter)
 36 | 
 37 | # Add both handlers to the logger
 38 | logger.addHandler(file_handler)
 39 | logger.addHandler(stream_handler)
 40 | 
 41 | 
 42 | app = Flask(__name__)
 43 | 
 44 | 
 45 | global MANAGER
 46 | global QUEUE
 47 | global N_CPUS
 48 | MANAGER = ...
 49 | QUEUE = ...
 50 | N_CPUS=... # Will be set in init_globals after parse_args()
 51 | 
 52 | @contextlib.contextmanager
 53 | def tqdm_joblib(tqdm_object):
 54 |     """Context manager to patch joblib to report into tqdm progress bar given as argument"""
 55 |     class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
 56 |         def __call__(self, *args, **kwargs):
 57 |             tqdm_object.update(n=self.batch_size)
 58 |             return super().__call__(*args, **kwargs)
 59 | 
 60 |     old_batch_callback = joblib.parallel.BatchCompletionCallBack
 61 |     joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
 62 |     try:
 63 |         yield tqdm_object
 64 |     finally:
 65 |         joblib.parallel.BatchCompletionCallBack = old_batch_callback
 66 |         tqdm_object.close()
 67 |         
 68 | 
 69 | def init_globals(n_workers: int = -1, use_logical_cpus: bool = False): 
 70 |     global MANAGER
 71 |     global QUEUE 
 72 |     global N_CPUS
 73 |     
 74 |     MANAGER = multiprocessing.Manager()
 75 |     QUEUE = MANAGER.Queue()
 76 |     if use_logical_cpus: 
 77 |         cpu_list = benchmarking.add_logicial_cpus_to_queue(n_workers, QUEUE)
 78 |     else: 
 79 |         cpu_list = benchmarking.add_physical_cpus_to_queue(n_workers, QUEUE)
 80 |     N_CPUS = len(cpu_list)
 81 |     print(f"Initialized globals with {N_CPUS} cpus")
 82 |     return None
 83 | 
 84 | 
 85 | def parse_args():
 86 |     parser = argparse.ArgumentParser(description='Gem5 API')
 87 |     parser.add_argument('--api_key', type=str, help='required API key on initialization for authentication')
 88 |     parser.add_argument('--port', type=int, default=706965, help='port number')
 89 |     parser.add_argument('--working_dir', type=str, default='/home/working_dir', help='working directory')
 90 |     parser.add_argument('--use_logical_cpus',  default=False, action="store_true") 
 91 |     parser.add_argument('--workers', type=int, default=-1, help='number of workers, if <0 (e.g. -1) then it uses all available physical cpus')
 92 |     parser.add_argument('--threaded',  default=False, action="store_true")
 93 |     parser.add_argument('--gem5_acc_threshold', type=float, default=0.95, help="mean threshold where if below this, we do not run gem5")
 94 |     parser.add_argument('--debug',  default=False, action="store_true")
 95 |     parser.add_argument('--exit_early_on_fail', action="store_true")
 96 |     ## gem5 and compilation parameters
 97 |     parser.add_argument('--testcases_dir', type=str, help='testcases directory', default="/home/pie-perf/data/codenet/merged_test_cases/")
 98 |     parser.add_argument('--cstd', type=str, help='cstd', default='--std=c++17')
 99 |     parser.add_argument('--optimization_flag', type=str, help='optimization', default='-O3')
100 |     parser.add_argument('--gem5_dir', type=str, help='path containing gem5 binary and build', default='/home/gem5/build/X86/')
101 |     parser.add_argument('--gem5_script_path', type=str, help='path to gem5 script', default='/home/gem5-skylake-config/gem5-configs/run-se.py')
102 |     parser.add_argument('--cpu_type', type=str, help='cpu type', default='Verbatim')
103 |     parser.add_argument('--path_to_atcoder', type=str, help='path to atcoder', default='/home/ac-library/')
104 |     parser.add_argument('--timeout_seconds_binary', type=int, help='timeout seconds for binary', default=10)
105 |     parser.add_argument('--timeout_seconds_gem5', type=int, help='timeout seconds for gem5', default=120)
106 |     
107 |     
108 |     args = parser.parse_args()
109 |     app.config.update(vars(args))
110 |     return args
111 | 
112 | def single_submission(code, testcases, problem_id, timing_env, queue, override_flags=""):
113 |     ## TODO -> check if any test cases are missing with hyperfine
114 |     logging.info(f"single_submission for problem {problem_id} with timing_env {timing_env} and testcases {testcases}")
115 |     override_flags = "" if not isinstance(override_flags, str) else override_flags
116 |     result = {}
117 |     cpu_number = queue.get(block=True) if timing_env in ("binary", "both") else None
118 |     logging.info(f"got cpu {cpu_number} in pid {os.getpid()}")
119 |     with tempfile.TemporaryDirectory() as tmpdirname:
120 |         code_path = os.path.join(tmpdirname, 'code.cpp')
121 |         with open(code_path, 'w') as f:
122 |             f.write(code)
123 |         print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']}  override_flags: {override_flags }")
124 |         cflags = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags
125 |         bin_path, accs = benchmarking.compile_and_check_outputs(
126 |             code_path=code_path,
127 |             problem_id=problem_id,
128 |             testcases_dir=app.config['testcases_dir'], 
129 |             timeout=app.config['timeout_seconds_binary'],
130 |             cflags=cflags, 
131 |             testcases=testcases, 
132 |             cpu_number=cpu_number)
133 |         result["compile_success"] = bin_path is not None
134 |         result['accs'] = accs
135 |         mean_accs = np.mean(list(accs.values()))
136 |         logging.info(f"mean_accs: {mean_accs}")
137 |         if mean_accs < app.config["gem5_acc_threshold"]: 
138 |             logging.info(f"mean_accs: {mean_accs} is below threshold {app.config['gem5_acc_threshold']}, skipping gem5")
139 |             if timing_env in ["gem5", "both"]:
140 |                 result["gem5"] = {} # return empty dict
141 |             if timing_env in ["binary", "both"]:
142 |                 result["binary"] = {} # return empty dict
143 |             return result
144 |         
145 |         if timing_env in ['gem5', 'both']: 
146 |             logging.info(f"running gem5 for problem {problem_id}")
147 |             gem5_results = benchmarking.run_gem5(
148 |                 gem5_dir=app.config['gem5_dir'],
149 |                 gem5_script_path=app.config['gem5_script_path'],
150 |                 cpu_type=app.config['cpu_type'],
151 |                 bin_path=bin_path,
152 |                 problem_id=problem_id,
153 |                 testcases_dir=app.config['testcases_dir'],
154 |                 timeout=app.config['timeout_seconds_gem5'],
155 |                 testcases=testcases,
156 |                 cpu_number=cpu_number, 
157 |                 exit_early_on_fail=app.config['exit_early_on_fail'])
158 |             result['gem5'] = gem5_results
159 |         if timing_env in ['binary', 'both']:
160 |             code2results, output = benchmarking.run_hyperfine(
161 |                 code_paths=[code_path],
162 |                 problem_ids=[problem_id],
163 |                 path_to_testcases=app.config['testcases_dir'],
164 |                 # TODO: REMOVE THIS HERE
165 |                 json_out_path=os.path.join(tmpdirname, 'hyperfine_results.json'),
166 |                 test_cases_list=[testcases],
167 |                 min_runs_per_test_case=10,
168 |                 max_runs_per_test_case=500,
169 |                 warmup_runs_per_test_case=5,
170 |                 cpu_number=cpu_number, 
171 |                 do_sanity_check=True) # TODO: PIN TO CPU
172 |             binary_results = code2results[code_path]
173 |             result["binary"] = binary_results
174 |     if cpu_number is not None: queue.put(cpu_number)  # return the CPU to the pool only if one was checked out
175 |     return result
176 | 
177 | 
178 | def dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0="", override_flags_v1=""):
179 |     override_flags_v0 = "" if not isinstance(override_flags_v0, str) else override_flags_v0
180 |     override_flags_v1 = "" if not isinstance(override_flags_v1, str) else override_flags_v1
181 |     result = {}
182 |     cpu_number = queue.get(block=True)
183 |     with tempfile.TemporaryDirectory() as tmpdirname_v0, tempfile.TemporaryDirectory() as tmpdirname_v1:
184 |         code_path_v0 = os.path.join(tmpdirname_v0, 'code.cpp')
185 |         with open(code_path_v0, 'w') as f:
186 |             f.write(code_v0)
187 |         code_path_v1 = os.path.join(tmpdirname_v1, 'code.cpp')
188 |         with open(code_path_v1, 'w') as f:
189 |             f.write(code_v1)
190 | 
191 |         print(f"app cfg cstd {app.config['cstd']} app.config['optimization_flag']: {app.config['optimization_flag']}  override_flags_v0: {override_flags_v0 }")
192 |         cflags_v0 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags_v0 
193 |         cflags_v1 = app.config['cstd'] + ' ' + app.config['optimization_flag'] + override_flags_v1
194 |         
195 |         bin_path_v0, accs_v0 = benchmarking.compile_and_check_outputs(
196 |             code_path=code_path_v0,
197 |             problem_id=problem_id,
198 |             testcases_dir=app.config['testcases_dir'], 
199 |             timeout=app.config['timeout_seconds_binary'],
200 |             cflags=cflags_v0, 
201 |             testcases=testcases, 
202 |             cpu_number=cpu_number)
203 |         bin_path_v1, accs_v1 = benchmarking.compile_and_check_outputs(
204 |             code_path=code_path_v1,
205 |             problem_id=problem_id,
206 |             testcases_dir=app.config['testcases_dir'], 
207 |             timeout=app.config['timeout_seconds_binary'],
208 |             cflags=cflags_v1, 
209 |             testcases=testcases, 
210 |             cpu_number=cpu_number)
211 |         result["compile_success_v0"] = bin_path_v0 is not None
212 |         result["compile_success_v1"] = bin_path_v1 is not None
213 |         result['accs_v0'] = accs_v0
214 |         result['accs_v1'] = accs_v1
215 |         if timing_env in ['gem5', 'both']:
216 |             gem5_results_v0 = benchmarking.run_gem5(
217 |                 gem5_dir=app.config['gem5_dir'],
218 |                 gem5_script_path=app.config['gem5_script_path'],
219 |                 cpu_type=app.config['cpu_type'],
220 |                 bin_path=bin_path_v0,
221 |                 problem_id=problem_id,
222 |                 testcases_dir=app.config['testcases_dir'],
223 |                 timeout=app.config['timeout_seconds_gem5'],
224 |                 testcases=testcases,
225 |                 cpu_number=cpu_number, 
226 |                 exit_early_on_fail=app.config['exit_early_on_fail'])
227 |             result['gem5_v0'] = gem5_results_v0
228 |             gem5_results_v1 = benchmarking.run_gem5(
229 |                 gem5_dir=app.config['gem5_dir'],
230 |                 gem5_script_path=app.config['gem5_script_path'],
231 |                 cpu_type=app.config['cpu_type'],
232 |                 bin_path=bin_path_v1,
233 |                 problem_id=problem_id,
234 |                 testcases_dir=app.config['testcases_dir'],
235 |                 timeout=app.config['timeout_seconds_gem5'],
236 |                 testcases=testcases,
237 |                 cpu_number=cpu_number, 
238 |                 exit_early_on_fail=app.config['exit_early_on_fail'])
239 |             result['gem5_v1'] = gem5_results_v1
240 |         if timing_env in ['binary', 'both']:
241 |             code2results, output = benchmarking.run_hyperfine(
242 |                 code_paths=[code_path_v0, code_path_v1],
243 |                 problem_ids=[problem_id, problem_id],
244 |                 path_to_testcases=app.config['testcases_dir'],
245 |                 json_out_path=os.path.join(tmpdirname_v0, 'hyperfine_results.json'),
246 |                 test_cases_list=[testcases, testcases],
247 |                 min_runs_per_test_case=10,
248 |                 max_runs_per_test_case=500,
249 |                 warmup_runs_per_test_case=5,
250 |                 cpu_number=cpu_number, 
251 |                 do_sanity_check=True)
252 |             result["binary_v0"] = code2results[code_path_v0]
253 |             result["binary_v1"] = code2results[code_path_v1]
254 |     queue.put(cpu_number)
255 |     return result
256 | 
257 | 
258 | def multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list=None):
259 |     assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
260 |     with tqdm_joblib(tqdm(desc="Running multiple single submissions", total=len(code_list))) as progress_bar:
261 |         results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(single_submission)(code, testcases, problem_id, timing_env, queue, override_flags) for code, testcases, problem_id, override_flags in zip(code_list, testcases_list, problem_id_list, override_flags_list))
262 |     return results
263 | 
264 | def multiple_dual_submissions(code_v0_list, code_v1_list, testcases_list, problem_id_list, timing_env, queue, cpus, override_flags_list_v0, override_flags_list_v1):
265 |     assert len(code_v0_list) == len(code_v1_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(override_flags_list_v1)
266 |     results = Parallel(n_jobs=cpus, verbose=10, backend="multiprocessing")(delayed(dual_submission)(code_v0, code_v1, testcases, problem_id, timing_env, queue, override_flags_v0, override_flags_v1) for code_v0, code_v1, testcases, problem_id, override_flags_v0, override_flags_v1 in zip(code_v0_list, code_v1_list, testcases_list, problem_id_list, override_flags_list_v0, override_flags_list_v1))
267 |     return results
268 | 
269 |     
270 | @app.route('/gem5/single_submission', methods=['GET'])
271 | def SingleSubmission(): 
272 |     req = request.get_json()
273 |     if req["api_key"] != app.config["api_key"]:
274 |         return jsonify({"error": "Invalid API key"})
275 |     code = req['code']
276 |     testcases = req['testcases']
277 |     problem_id = req['problem_id']
278 |     timing_env = req['timing_env']
279 |     assert len(testcases) > 0
280 |     assert len(code) > 0
281 |     assert timing_env in ['gem5', 'binary', 'both']
282 |     
283 |     override_flags = req.get('override_flags', "")
284 |     results = single_submission(code, testcases, problem_id, timing_env, QUEUE, override_flags)
285 |     return jsonify(results)
286 | 
287 | @app.route('/gem5/multiple_single_submissions', methods=['GET'])
288 | def MultipleSubmissions():
289 |     req = request.get_json()
290 |     if req["api_key"] != app.config["api_key"]:
291 |         return jsonify({"error": "Invalid API key"})
292 |     submissions = req['submissions']
293 |     timing_env = req['timing_env']
294 |     code_list = [r['code'] for r in submissions]
295 |     testcases_list = [r['testcases'] for r in submissions]
296 |     problem_id_list = [r['problem_id'] for r in submissions]
297 |     override_flags_list = [r.get('override_flags_list', "") for r in submissions]
298 |     
299 |     assert len(code_list) == len(testcases_list) == len(problem_id_list) == len(override_flags_list)
300 |     assert timing_env in ['gem5', 'binary', 'both']
301 |     assert len(code_list) > 0
302 |     assert len(testcases_list) > 0
303 |     assert len(problem_id_list) > 0
304 |     assert len(override_flags_list) > 0
305 |     assert all([len(code) > 0 for code in code_list])
306 |     assert all([len(testcases) > 0 for testcases in testcases_list])
307 |     
308 |     results = multiple_single_submissions(code_list, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list)
309 |     
310 |     return jsonify(results)
311 | 
312 | @app.route('/gem5/single_submission_pair', methods=['GET'])
313 | def SingleSubmissionPair():
314 |     req = request.get_json()
315 |     if req["api_key"] != app.config["api_key"]:
316 |         return jsonify({"error": "Invalid API key"})
317 |     #assert len(req) == 2
318 |     code_v0 = req['code_v0']
319 |     code_v1 = req['code_v1']
320 |     testcases = req['testcases']
321 |     problem_id = req['problem_id']
322 |     timing_env = req['timing_env']
323 |     assert len(testcases) > 0
324 |     assert len(code_v0) > 0
325 |     assert len(code_v1) > 0
326 |     assert timing_env in ['gem5', 'binary', 'both']
327 |     
328 |     override_flags = req.get('override_flags', "")
329 |     results = dual_submission(code_v0, code_v1, testcases, problem_id, timing_env, QUEUE, override_flags)
330 |     return jsonify(results)
331 | 
332 | @app.route('/gem5/multiple_submissions_pairs', methods=['GET'])
333 | def MultipleSubmissionsPair():
334 |     req = request.get_json()
335 |     if req["api_key"] != app.config["api_key"]:
336 |         return jsonify({"error": "Invalid API key"})
337 |     submissions_v0 = req['submissions_v0']
338 |     submissions_v1 = req['submissions_v1']
339 |     timing_env = req['timing_env']
340 |     
341 |     code_list_v0 = [r['code'] for r in submissions_v0]
342 |     code_list_v1 = [r['code'] for r in submissions_v1]
343 |     testcases_list = [r['testcases'] for r in submissions_v0]
344 |     problem_id_list = [r['problem_id'] for r in submissions_v0]
345 |     
346 |     override_flags_list_v0 = [r.get('override_flags_list', "") for r in submissions_v0]
347 |     override_flags_list_v1 = [r.get('override_flags_list', "") for r in submissions_v1]
348 |     
349 |     assert len(code_list_v0) == len(testcases_list) == len(problem_id_list) == len(override_flags_list_v0) == len(code_list_v1) == len(override_flags_list_v1)
350 |     assert timing_env in ['gem5', 'binary', 'both']
351 |     assert len(code_list_v0) > 0
352 |     assert len(testcases_list) > 0
353 |     assert len(problem_id_list) > 0
354 |     assert all([len(code) > 0 for code in code_list_v0])
355 |     assert all([len(code) > 0 for code in code_list_v1])
356 |     assert all([len(testcases) > 0 for testcases in testcases_list])
357 |     
358 |     results = multiple_dual_submissions(code_list_v0, code_list_v1, testcases_list, problem_id_list, timing_env, QUEUE, N_CPUS, override_flags_list_v0, override_flags_list_v1)
359 |     return jsonify(results)
360 | 
361 | @app.route('/gem5/ping', methods=['GET'])
362 | def Ping():
363 |     return jsonify({"status": "ok"})
364 | 
365 | 
366 | if __name__ == '__main__':
367 |     args = parse_args()
368 |     init_globals(args.workers, args.use_logical_cpus)
369 |     app.run(host="0.0.0.0", port=args.port, debug=args.debug)
370 |     
371 |     
372 |     
373 |     
374 | 
375 |     
376 |     
377 | 
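
For reference, a minimal client sketch for the Flask endpoints above (hypothetical host, port, and API key; assumes the requests package is installed). The routes are registered for GET but read a JSON body, so the payload goes in the json= argument:

import requests

BASE = "http://localhost:5000"  # hypothetical; use the --port the server was started with
API_KEY = "my-secret-key"       # must match the server's --api_key

# Health check.
print(requests.get(f"{BASE}/gem5/ping").json())

# Benchmark one program; timing_env is one of 'gem5', 'binary', or 'both'.
payload = {
    "api_key": API_KEY,
    "code": "#include <iostream>\nint main() { std::cout << 42 << std::endl; }\n",
    "testcases": [0, 1],            # test case numbers under <testcases_dir>/<problem_id>/
    "problem_id": "p00000",         # hypothetical problem id
    "timing_env": "binary",
}
print(requests.get(f"{BASE}/gem5/single_submission", json=payload).json())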


--------------------------------------------------------------------------------
/gem5/gem5_eval.py:
--------------------------------------------------------------------------------
  1 | # from src.codenet_eval.run_eval import (read_ground_truths, read_inputs_and_prepare)
  2 | # from src.codenet_eval.evalconfig import EvaluationConfig
  3 | import tarfile                   
  4 | import shutil        
  5 | import tempfile
  6 | import logging 
  7 | import pandas as pd
  8 | import json
  9 | import os 
 10 | import pdb
 11 | import argparse
 12 | from gem5.simulator import PieEnvironment
 13 | from gem5 import simulator
 14 | import traceback
 15 | import pdb
 16 | import threading
 17 | from tqdm import tqdm
 18 | import re
 19 | from typing import Optional, Any
 20 | import yaml
 21 | from dataclasses import dataclass, field
 22 | import ast
 23 | 
 24 | logging.basicConfig(level=logging.INFO)
 25 | 
 26 | import signal
 27 | import time
 28 | 
 29 | KEY_COLS = ["n_tests", 
 30 |             "problem_id", 
 31 |             "tests",
 32 |             "src_id", 
 33 |             "tgt_id", 
 34 |             "fastest_runtime", "fastest_accuracy"]
 35 | 
 36 | 
 37 | def get_key_columns(df, cfg):
 38 |     ## keep columns that are in KEY_COLS or that end with
 39 |     ## _compilation, _accuracy, _runtime, _tc2time
 40 |     key_cols = [c for c in df.columns if c in KEY_COLS or c.endswith("_compilation") or c.endswith("_accuracy") or c.endswith("_runtime") or c.endswith("_tc2time")]
 41 |     key_cols += [c for c in df.columns if cfg.model_generated_potentially_faster_code_col in c] + [cfg.slow_code_col, cfg.reference_code_col]
 42 |     key_cols = list(set(key_cols))
 43 |     return df[key_cols]
 44 | 
 45 | def _fix_value(x: Any) -> Any:
 46 |     ## if starts with '[' and ends with ']', as a string, then convert to list
 47 |     if isinstance(x, str) and len(x) > 1 and x[0] == '[' and x[-1] == ']':
 48 |         x = ast.literal_eval(x)
 49 |     return x
 50 | 
 51 | def fix_df_columns(df):
 52 |     for col in df.columns:
 53 |         df[col] = df[col].apply(lambda x: _fix_value(x))
 54 |     return df
 55 |     
 56 | 
 57 | 
 58 | def unmelt_results(results_df, cfg, remove_extra_cols=False):
 59 |         unmelted_data = []
 60 |         for src_id, group in results_df.groupby("src_id"):
 61 |             src_code_row = group[group["code_type"] == "src_code"].iloc[0]
 62 |             new_row = src_code_row.to_dict()
 63 |             for index, row in group.iterrows():
 64 |                 new_row["src_id"] = src_id
 65 |                 new_row[f'{row["code_type"]}_compilation'] = row["compilation"]
 66 |                 new_row[f'{row["code_type"]}'] = row["code"]
 67 |                 if row["code_type"].startswith(cfg.model_generated_potentially_faster_code_col) or cfg.redo_src_tgt:
 68 |                     new_row[f'{row["code_type"]}_accuracy'] = row["accuracy"]
 69 |                     new_row[f'{row["code_type"]}_agg_runtime'] = row["agg_runtime"]
 70 |                     new_row[f'{row["code_type"]}_tc2time'] = row["tc2time"]
 71 |             unmelted_data.append(new_row)
 72 |         ## clean up the column names
 73 |         unmelted_df = pd.DataFrame(unmelted_data)
 74 |         if remove_extra_cols:
 75 |             unmelted_df = get_key_columns(unmelted_df, cfg)
 76 |         
 77 |         # unmelted_df = rename_columns(unmelted_df)
 78 |         
 79 |         return unmelted_df
 80 |         
 81 | def report_results(df, cfg, orig_df): 
 82 |         ## all columns will be cfg.model_generated_potentially_faster_code_col_*
 83 |         ## for these, consider only those that are not None, above threshold_accuracy, and have the fastest_runtime
 84 |         ## for those, keep the runtime, but if the accuracy is below threshold_accuracy, set the runtime to float("inf")
 85 |         
 86 |         ## then consider only max_generations_to_report
 87 |         
 88 |         ## in 1, 2, 4... (powers of 2 up until len(runtimes)), report the best runtime
 89 |         ## as runtime_best@1, runtime_best@2, runtime_best@4, etc. accuracy_best@1, accuracy_best@2, accuracy_best@4, etc.
 90 |         ## while also reporting speedup_best@1, speedup_best@2, speedup_best@4, etc. where speedup = runtime_src / runtime_best@n 
 91 |         
 92 |         
 93 |         ## then aggregate 
 94 |         ### 1. for each 1, 2.. (powers of 2 up until len(runtimes)), report mean_accuracy@n, mean_speedup@n where we also take speedup = min(1.0, runtime_src / runtime_best@n)
 95 |         ### 2. for each 1, 2.. (powers of 2 up until len(runtimes)), report the % of programs where the speedup is >= 1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0
 96 |     
 97 |         # merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
 98 |         import pdb
 99 |         # pdb.set_trace()
100 |         # print("columns before report_results")
101 |         # print(df.columns)
102 |         
103 |         
104 |         # num_generated_cols = len([c for c in df.columns if re.match(f"{cfg.model_generated_potentially_faster_code_col}_[0-9]+", c) or c == cfg.model_generated_potentially_faster_code_col])
105 |         num_generated_cols = cfg.num_generated_cols
106 |         assert num_generated_cols is not None, f"num_generated_cols is None, it should have been set in read_inputs_and_prepare_v2"
107 |         
108 |         import pandas as pd
109 |         import numpy as np
110 | 
111 |         # Assuming orig_df and df are already defined, and cfg and num_generated_cols are given
112 | 
113 |         # Step 1: Find rows in orig_df that are not in df
114 |         # do this with src_code not src_id 
115 |         print(f"length of orig_df {len(orig_df)} vs length of results_df {len(df)}")
116 |         orig_df["src_tgt_code"] = orig_df[cfg.slow_code_col] + orig_df[cfg.reference_code_col]
117 |         df["src_tgt_code"] = df[cfg.slow_code_col] + df[cfg.reference_code_col]
118 |         # drop duplicates from both 
119 |         df = df.drop_duplicates(subset=["src_tgt_code"])
120 |         orig_df = orig_df.drop_duplicates(subset=["src_tgt_code"])
121 |         unique_rows = orig_df[~orig_df['src_tgt_code'].isin(df['src_tgt_code'])]
122 |         assert len(unique_rows) == (len(orig_df) - len(df)), f"len(unique_rows) {len(unique_rows)} == len(orig_df) - len(df) {len(orig_df) - len(df)}"
123 | 
124 |         # Step 2: Create additional columns for the unique rows and set default values
125 |         for j in range(num_generated_cols + 1):  # Adding 1 to include the case when j == num_generated_cols
126 |             colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
127 |             unique_rows[f"{colname}_agg_runtime"] = float("inf")  # Setting runtime to inf
128 |             unique_rows[f"{colname}_accuracy"] = 0  # Setting accuracy to 0
129 |             unique_rows[f"{colname}_tc2time"] = [{} for _ in range(len(unique_rows))]  # Setting tc2time to {}
130 |         # drop unique rows columns that are not in df
131 |         unique_rows = unique_rows[[c for c in unique_rows.columns if c in df.columns]]
132 | 
133 |         # Step 3: Append the modified unique rows to df
134 |         df = pd.concat([df, unique_rows], ignore_index=True)
135 | 
136 |         print(f"columns after appending {df.columns}")
137 |         print(f"unique rows columns {unique_rows.columns}")
138 |         assert len(df) == 978, f"len(df) {len(df)} == 978"
139 |         
140 |         new_rows = []
141 |         for i, row in df.iterrows():
142 |             for j in range(num_generated_cols):
143 |                 colname = f"{cfg.model_generated_potentially_faster_code_col}_{j}" if num_generated_cols > 0 else cfg.model_generated_potentially_faster_code_col
144 |                 if row[colname] is None or pd.isna(row[colname]) or pd.isnull(row[colname]):
145 |                     row[f"{colname}_agg_runtime_adjusted"] = float("inf")
146 |                 elif row[f"{colname}_accuracy"] < cfg.threshold_accuracy:
147 |                     row[f"{colname}_agg_runtime_adjusted"] = float("inf")
148 |                 else: 
149 |                     row[f"{colname}_agg_runtime_adjusted"] = row[f"{colname}_agg_runtime"]
150 |             row["fastest_generated_agg_runtime"] = min([row[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted"] for j in range(num_generated_cols)])
151 |             new_rows.append(row)
152 |             
153 |         df = pd.DataFrame(new_rows)
154 |         
155 |         problem_id_to_fastest_agg_runtime = {}
156 |         problem_id_to_fastest_correctness = {}
157 |         for i, group in df.groupby("problem_id"):
158 |             problem_id_to_fastest_agg_runtime[i] = group["fastest_generated_agg_runtime"].min()
159 |             problem_id_to_fastest_correctness[i] = problem_id_to_fastest_agg_runtime[i] < float("inf")
160 |             
161 |         df["fastest_generated_runtime_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_agg_runtime[x])
162 |         df["fastest_generated_speedup_over_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
163 |         df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].apply(lambda x: max(1.0, x))
164 |         df["fastest_generated_correctness_over_all_submissions"] = df["problem_id"].apply(lambda x: problem_id_to_fastest_correctness[x])
165 |         
166 |         
167 |         for i in range(1, num_generated_cols+1):
168 |             if num_generated_cols == 0:
169 |                 df[f"agg_runtime_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_agg_runtime_adjusted"]
170 |                 df[f"accuracy_best@{i}"] = df[f"{cfg.model_generated_potentially_faster_code_col}_accuracy"]
171 |                 df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] >= cfg.threshold_accuracy
172 |             else:
173 |                 df[f"agg_runtime_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_agg_runtime_adjusted" for j in range(i)]].min(axis=1)
174 |                 df[f"accuracy_best@{i}"] = df[[f"{cfg.model_generated_potentially_faster_code_col}_{j}_accuracy" for j in range(i)]].max(axis=1)
175 |                 df[f"is_correct_best@{i}"] = df[f"accuracy_best@{i}"] >= cfg.threshold_accuracy
176 |             df[f"speedup_best@{i}"] = df[cfg.slow_code_col+"_agg_runtime"] / df[f"agg_runtime_best@{i}"]
177 |             df[f"speedup_best@{i}"] = df[f"speedup_best@{i}"].apply(lambda x: max(1.0, x))
178 |             df["speedup_of_fastest_generated_of_all_submissions"] = df[cfg.slow_code_col+"_agg_runtime"] / df["fastest_generated_runtime_over_all_submissions"]
179 |             df["speedup_of_fastest_generated_of_all_submissions"] = df["speedup_of_fastest_generated_of_all_submissions"].apply(lambda x: max(1.0, x))
180 |         
181 |         ## aggregate over all rows
182 |         agg_df = pd.DataFrame(index=[0])
183 |         # agg_df["fastest_generated_runtime_over_all_submissions"] = df["fastest_generated_runtime_over_all_submissions"].mean()
184 |         agg_df["fastest_generated_correctness_over_all_submissions"] = df["fastest_generated_correctness_over_all_submissions"].mean()
185 |         agg_df["fastest_generated_speedup_over_all_submissions"] = df["fastest_generated_speedup_over_all_submissions"].mean()
186 |         # import pdb
187 |         for i in range(1, num_generated_cols+1):
188 |             # pdb.set_trace()
189 |             agg_df[f"mean_accuracy_best@{i}"] = df[f"accuracy_best@{i}"].mean()
190 |             agg_df[f"is_correct_best@{i}"] = df[f"is_correct_best@{i}"].mean()
191 |             agg_df[f"mean_speedup_best@{i}"] = df[f"speedup_best@{i}"].mean()
192 |             for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
193 |                 agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"] = (df[f"speedup_best@{i}"] >= speedup_threshold).mean()
194 |                 
195 |         ## add the speedup of tgt_code over src_code and the threshold speedups of tgt_code over src_code
196 |         df["speedup_tgt_over_src"] = df[cfg.slow_code_col+"_agg_runtime"] / df[cfg.reference_code_col+"_agg_runtime"]
197 |         agg_df["mean_speedup_tgt_over_src"] = df["speedup_tgt_over_src"].mean()
198 |         for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
199 |             agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_tgt_over_src"] >= speedup_threshold).mean()
200 |             agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"] = (df["speedup_of_fastest_generated_of_all_submissions"] >= speedup_threshold).mean()
201 |         
202 |         ## pretty print out a report 
203 |         
204 |         ## first print out the columns with asterisks separating fields *********
205 |         print("********* Aggregated Results *********")
206 |         for i in range(1, num_generated_cols+1):
207 |             print(f"********* Results Best at {i} Generations *********")
208 |             mean_accuracy = agg_df[f"mean_accuracy_best@{i}"][0]
209 |             mean_speedup = agg_df[f"mean_speedup_best@{i}"][0]
210 |             
211 |             print(f"mean_accuracy_best@{i}: {mean_accuracy}")
212 |             print(f"mean correctness best@{i}: {agg_df[f'is_correct_best@{i}'][0]}")
213 |             print(f"mean_speedup_best@{i}: {mean_speedup} vs. mean_speedup_tgt_over_src: {agg_df['mean_speedup_tgt_over_src'][0]}")
214 |             for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
215 |                 percent_programs = agg_df[f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}"][0]
216 |                 percent_programs_tgt_over_src = agg_df[f"percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}"][0]
217 |                 print(f"percent_programs_speedup_best@{i}>=speedup_threshold_{speedup_threshold}: {percent_programs} vs. percent_programs_speedup_tgt_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs_tgt_over_src}")
218 |             print("*****************************************")
219 |         print("********* Results Fastest Generated Over All Submissions *********")
220 |         print("mean correctness fastest_generated_over_all_submissions: ", agg_df["fastest_generated_correctness_over_all_submissions"][0])
221 |         print("average fastest_generated_speedup_over_all_submissions: ", agg_df["fastest_generated_speedup_over_all_submissions"][0])
222 |         for speedup_threshold in [1.10, 1.25, 1.50, 1.75, 2.0, 2.5, 3.0, 4.0, 5.0, 10.0]:
223 |             percent_programs = agg_df[f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}"][0]
224 |             print(f"percent_programs_speedup_fastest_generated_over_src>=speedup_threshold_{speedup_threshold}: {percent_programs}")
225 |         print("********* End Aggregated Results *********")
226 |             
227 |         return agg_df, df
228 | 
229 | # module-level handle to the PieEnvironment created in main(); used by the SIGINT handler for teardown
230 | env = None
231 | 
232 | 
233 | def sigint_handler(signum, frame):
234 |     global env
235 |     print("Ctrl-C pressed, running teardown...")
236 |     if env is not None and threading.current_thread().name == "MainThread":
237 |         env.teardown()
238 |     print("Teardown complete, exiting...")
239 |     exit(0)
240 | 
241 | # Set the signal handler for Ctrl+C (SIGINT)
242 | signal.signal(signal.SIGINT, sigint_handler)
243 | 
244 | 
245 | 
246 | def read_inputs_and_prepare_v2(cfg) -> pd.DataFrame:
247 |     """Reads the model generated output, the reference, joins them, and returns a dataframe with the merged data."""
248 |     logging.info(f"Reading reference file from {cfg.reference_file_path}")
249 |     logging.info(f"Reading model generated outputs from {cfg.model_generated_outputs_path}")
250 | 
251 |     
252 |     gen_df = pd.read_json(
253 |         cfg.model_generated_outputs_path, lines=True, orient="records"
254 |     )
255 |     gen_df = fix_df_columns(gen_df)
256 |     
257 |     logging.info(f"Read {len(gen_df)} rows from {cfg.model_generated_outputs_path}")
258 |     if cfg.is_prompt_based:
259 |         gen_df["slower_program"] = gen_df.apply(
260 |             lambda x: get_input_from_prompt(x), axis=1
261 |         )
262 |     else:
263 |         gen_df["slower_program"] = gen_df[cfg.slow_code_col].apply(lambda x: x.strip())
264 |         
265 |         
266 |     assert (
267 |         cfg.reference_code_col in gen_df.columns
268 |     ), f"Column {cfg.reference_code_col} not found in {cfg.model_generated_outputs_path}"
269 |     merged = gen_df
270 |     
271 |         
272 |     merged = merged[merged[cfg.slow_code_col] != merged[cfg.reference_code_col]]
273 | 
274 |     assert (
275 |         len(merged) > 0
276 |     ), f"{cfg.slow_code_col} and {cfg.reference_code_col} are the same for all programs"
277 |     
278 |     if cfg.num_problems_to_evaluate != -1:
279 |         merged = merged[: cfg.num_problems_to_evaluate]
280 |     
281 |     
282 |     # if the generated code is a list, then we have multiple generations per input. 
283 |     # we add one column per generation
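    # e.g. with the default column name, a row with generated_answers = [prog_a, prog_b] gains
    # columns generated_answers_0 = prog_a and generated_answers_1 = prog_b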
284 |     if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list) or isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series) or (merged[cfg.model_generated_potentially_faster_code_col].iloc[0][0] == '[' and merged[cfg.model_generated_potentially_faster_code_col].iloc[0][-1] == ']'):
285 |         
286 |         if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], str):
287 |             import ast
288 |             merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: ast.literal_eval(x))
289 |         if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], pd.Series):
290 |             merged[cfg.model_generated_potentially_faster_code_col] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x.tolist())
291 |         num_generations = max(merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: len(x)).tolist())
292 |         
293 |         for i in range(num_generations):
294 |             merged[f"{cfg.model_generated_potentially_faster_code_col}_{i}"] = merged[cfg.model_generated_potentially_faster_code_col].apply(lambda x: x[i] if i < len(x) else None)
295 |             # so merged will have the same number of columns for all rows, but some rows will have None in some columns (because they have fewer generations)
296 |     else: 
297 |         num_generations = 1
298 |             
299 |     cfg.num_generated_cols = num_generations
300 |     
301 |     return merged
302 | 
303 |                                        
304 |                                        
305 | def main(cfg):
306 |     # Step 0
307 |     merged = read_inputs_and_prepare_v2(cfg)
308 |     reference_df = pd.read_json(cfg.reference_file_path, lines=True, orient="records")
309 |     
310 |     logging.info(f"Number of programs to evaluate: {len(merged)}")
311 |     logging.info(f"Input column: {cfg.slow_code_col}")
312 |     logging.info(f"Reference column: {cfg.reference_code_col}")
313 |     logging.info(f"Model generated column: {cfg.model_generated_potentially_faster_code_col}")
314 | 
315 |     # Step 1: Read the inputs 
316 | 
317 |     # problem_id_to_ground_truths = read_ground_truths(cfg, merged)
318 |     
319 |     # Step 2: Write the inputs to a temporary directory
320 |     
321 |     tempdir = tempfile.TemporaryDirectory()
322 | 
323 |     ## we need to melt the dataframe from [slow, fast, generated_i] -> column of code_type and column of code
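    ## e.g. one input row yields one melted row per code column, with code_type taking values such as
    ## src_code, tgt_code, generated_answers_0, ... (default column names) and code holding the program text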
324 |     generated_cols = []
325 |     if isinstance(merged[cfg.model_generated_potentially_faster_code_col].iloc[0], list):
326 |         generated_cols = [colname for colname in merged.columns if colname.startswith(cfg.model_generated_potentially_faster_code_col) and colname[-1].isdigit()]
327 |     else: 
328 |         generated_cols = [cfg.model_generated_potentially_faster_code_col]
329 |     
330 |     logging.info(f"Generated columns: {generated_cols}")
331 |     code_cols = [cfg.slow_code_col, cfg.reference_code_col] + generated_cols
332 |     
333 |     ##PATCH 
334 |     ## rename src_agg_runtime -> src_code_agg_runtime and tgt_agg_runtime -> tgt_code_agg_runtime
335 |     if "src_agg_runtime" in merged.columns and "tgt_agg_runtime" in merged.columns:
336 |         merged = merged.rename(columns={"src_agg_runtime": cfg.slow_code_col+"_agg_runtime", "tgt_agg_runtime": cfg.reference_code_col+"_agg_runtime"})
337 |     
338 |     melted = pd.melt(merged, 
339 |                      value_vars=code_cols,
340 |                      var_name="code_type",
341 |                      value_name="code", 
342 |                      id_vars = [c for c in merged.columns if c not in code_cols])
343 |     
344 |     orig_len = len(melted)
345 |     #drop code na/null
346 |     melted = melted.dropna(subset=["code"])
347 |     
348 |     # sort by "n_tests"
349 |     melted = melted.sort_values(by=["n_tests"], ascending=False)
350 |     
351 |     if not os.path.exists(os.path.join(cfg.output_dir, "test_results.jsonl")):
352 |         # drop any rows where the code length is 0
353 |         melted = melted[melted["code"].apply(lambda x: len(x) > 0)]
354 |         logging.info(f"Dropped {orig_len - len(melted)} rows with NA or empty code")
355 |         
356 |         if not cfg.redo_src_tgt:
357 |             ## remove and cache the rows where code_type == "src_code" or "tgt_code"
358 |             src_tgt_rows = melted[(melted["code_type"] == f"{cfg.slow_code_col}") | (melted["code_type"] == f"{cfg.reference_code_col}")]
359 |             melted = melted[(melted["code_type"] != f"{cfg.slow_code_col}") & (melted["code_type"] != f"{cfg.reference_code_col}")]
360 |             # pdb.set_trace()
361 |         else: 
362 |             ## if we're re-running the src_code and tgt_code, then cache the old agg_runtimes
363 |             orig_src_colname = cfg.slow_code_col.replace("_code", "_agg_runtime")
364 |             orig_tgt_colname = cfg.reference_code_col.replace("_code", "_agg_runtime")
365 |             new_src_colname = cfg.slow_code_col.replace("_code", "_original_agg_runtime")
366 |             new_tgt_colname = cfg.reference_code_col.replace("_code", "_original_agg_runtime")
367 |             melted.rename(columns={orig_src_colname: new_src_colname, orig_tgt_colname: new_tgt_colname}, inplace=True)
368 |         
369 |         print(f"Number of programs to evaluate after dropping NA: {len(melted)}")
370 |         try: 
371 |             if not os.path.exists(cfg.output_dir):
372 |                 os.makedirs(cfg.output_dir)
373 |             global env
374 |             env = simulator.make(timeout_seconds_gem5=120, verbose=True, use_logical_cpus=True, port=8888, workers=40, exit_early_on_fail=True)
375 |             ## iterate in batches of cpus_available; env.submit_multiple_single_submissions() will submit the batch at once
376 |             new_rows = []
377 |             pbar = tqdm(total=len(melted), desc=f"Submitting {len(melted)} programs to evaluate", smoothing=0)
378 |             if cfg.cpus_available == -1: 
379 |                 cfg.cpus_available = len(melted)
380 |             # legacy - we used to submit in batches
381 |             batch = melted
382 |             # currently sorting the list of tests in reverse order of length, so that the (potentially) longest tests are run first
383 |             # this may give more "conservative" estimates of the runtime shown by tqdm
384 |             results = env.submit_multiple_single_submissions(batch["code"].tolist(),
385 |                                                                 [sorted(list(t), reverse=True) for t in batch["tests"].tolist()],
386 |                                                                 batch["problem_id"].tolist(),
387 |                                                                 "gem5")
388 |             
389 |             # zip the rows and results together
390 |             for (i, row), result in zip(batch.iterrows(), results):
391 |                 row["compilation"] = result.compilation
392 |                 row["accuracy"] = result.mean_acc
393 |                 row["agg_runtime"] = result.agg_runtime
394 |                 row["tc2time"] = result.tc2time
395 |                 row["tc2stats"] = result.tc2stats # this is a lot of data, toggle if we need all the outputs from gem5's stats.txt
396 |                 new_rows.append(row)
397 |             # pbar.update(len(batch))
398 |             melted = pd.DataFrame(new_rows)
399 |             melted.to_json(
400 |                 f"{cfg.output_dir}/melted_test_results.jsonl", 
401 |                 orient="records",
402 |                 lines=True
403 |             )
404 |             env.teardown()
405 |         ## if we get an exception, we still want to tear down the environment; otherwise it will likely leave a docker container running
406 |         except Exception as e:
407 |             print(e)
408 |             traceback.print_exc()
409 |             if threading.current_thread().name == "MainThread":
410 |                 # global env
411 |                 env.teardown()
412 |             raise e
413 |         
414 |         if not cfg.redo_src_tgt:
415 |             ## add back the src_code and tgt_code rows
416 |             melted = pd.concat([melted, src_tgt_rows])
417 |         
418 |         unmelted_df = unmelt_results(melted, cfg)
419 |         
420 |         unmelted_df.to_json(
421 |             f"{cfg.output_dir}/test_results.jsonl",
422 |             orient="records",
423 |             lines=True
424 |         )
425 |     else:
426 |         unmelted_df = pd.read_json(
427 |             f"{cfg.output_dir}/test_results.jsonl",
428 |             orient="records",
429 |             lines=True
430 |         )
431 |     
432 |     agg_df, result_df = report_results(unmelted_df, cfg, reference_df)
433 |     
434 |     agg_df.to_csv(
435 |         f"{cfg.output_dir}/aggregated_results.csv",
436 |         index=False
437 |     )
438 |     
439 |     result_df.to_json(
440 |         f"{cfg.output_dir}/addtl_stats.jsonl",
441 |         orient="records",
442 |         lines=True
443 |     )
444 |     
445 |     print(f"Results written to {cfg.output_dir}")
446 |     
447 | 
448 | @dataclass
449 | class EvaluationConfig:
450 |     model_generated_outputs_path: str
451 |     output_dir: str
452 |     reference_file_path: str
453 |     is_prompt_based: bool = False
454 |     model_generated_potentially_faster_code_col: str = "generated_answers"
455 |     slow_code_col: str = "src_code"
456 |     reference_code_col: str = "tgt_code"
457 |     cpuset_cpus: Optional[str] = None
458 |     do_eval: bool = False
459 |     cpus_available: int = 1
460 |     num_problems_to_evaluate: int = -1
461 |     threshold_accuracy: float = 1.0
462 |     redo_src_tgt: bool = False
463 |     num_generated_cols: Optional[int] = None
464 | 
465 | def load_config(yaml_path: str) -> EvaluationConfig:
466 |     with open(yaml_path, 'r') as f:
467 |         config_dict = yaml.safe_load(f)
468 |     return EvaluationConfig(**config_dict)
469 | 
470 | if __name__ == "__main__":
471 |     parser = argparse.ArgumentParser()
472 |     parser.add_argument("--config_path", type=str, required=True)
473 |     args = parser.parse_args()
474 |     config = load_config(args.config_path)
475 |     main(config)


--------------------------------------------------------------------------------
/gem5/pytest_simulator.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from gem5 import simulator
  3 | from gem5.simulator import PieEnvironment, PieSingleResult, PiePairResult, make
  4 | import numpy as np
  5 | from collections import defaultdict
  6 | from pprint import pprint
  7 | 
  8 | API_KEY="cdZ5TynkL5D7gCTFvzJT4YKu05aozTLp4GgIcK5"
  9 | 
 10 | example_1_code = """
 11 | #include 
 12 | #define REP(i, n) for (int i = 0; i < (n); i++)
 13 | using namespace std;
 14 | const int MOD = 998244353;
 15 | 
 16 | int main() {
 17 | 	cin.tie(0)->sync_with_stdio(false);
 18 | 
 19 | 	int n, k; cin >> n >> k;
 20 | 	vector l(k), r(k);
 21 | 	REP(i, k) cin >> l[i] >> r[i];
 22 | 	REP(i, k) r[i]++;
 23 | 
 24 | 	vector dp(n + 1, 0);
 25 | 	dp[0] = 1;
 26 | 	dp[1] = -1;
 27 | 	REP(i, n) {
 28 | 		if (i > 0)
 29 | 			dp[i] = (dp[i] + dp[i - 1]) % MOD;
 30 | 		REP(j, k) {
 31 | 			if (i + l[j] < n)
 32 | 				dp[i + l[j]] = (dp[i + l[j]] + dp[i]) % MOD;
 33 | 			if (i + r[j] < n)
 34 | 				dp[i + r[j]] = (((dp[i + r[j]] - dp[i]) % MOD) + MOD) % MOD;
 35 | 		}
 36 | 	}
 37 | 	cout << dp[n - 1] << endl;
 38 | 	return 0;
 39 | }
 40 | """
 41 | example_1_problem_id = "p02549"
 42 | 
 43 | example_2_code = """
 44 | #include
 45 | #include
 46 | typedef long long ll;
 47 | typedef unsigned int ui;
 48 | #define infin (ll)(998244353)
 49 | using namespace std;
 50 | int main()
 51 | {
 52 |    int n,k;
 53 |    cin>>n>>k;
 54 |    int l,r;
 55 |    vector dp(n+1,0); //0 to n
 56 |    vector >v;
 57 |    for(int j=0;j>l>>r;
 60 |       v.push_back({l,r});
 61 |    }
 62 |    dp[0]=1;;
 63 |    dp[1]=1;
 64 |    sort(v.begin(),v.end());
 65 |    auto z=v.begin();
 66 |    if ((*z).first==1)
 67 |       dp[2]=1;
 68 |    else
 69 |      dp[2]=0;
 70 |    for(int i=3;i<=n;i++)
 71 |    {
 72 |       dp[i]=dp[i-1];
 73 |       for (auto x:v)
 74 |       {
 75 |          if (i>x.first)
 76 |             dp[i]+=dp[i-x.first];
 77 |          else
 78 |             break;
 79 |          if (i-1>x.second)
 80 |             {
 81 |                dp[i]-=dp[i-1-x.second];
 82 |                if (dp[i]<0)
 83 |                   dp[i]+=infin;
 84 |             }
 85 |       }
 86 |       dp[i]=(dp[i]) % infin;
 87 |    }
 88 |    cout< 0.95
129 |          assert result.mean_acc_v1 > 0.95
130 | 
131 |          pprint(result.tc2time_v0)
132 |          pprint(result.tc2time_v1)
133 | 
134 |          print(
135 |                f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
136 |          print(
137 |                f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
138 |          print(
139 |                f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
140 |          print(
141 |                f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
142 | 
143 |          assert result.tc2time_v0[0] == 0.001035073468
144 |          assert result.tc2time_v0[1] == 0.001039205596
145 | 
146 |          assert result.tc2time_v1[0] == 0.001026564396
147 |          assert result.tc2time_v1[1] == 0.001029346032
148 | 
149 |          hyperfine_v0_tc2stats = result.tc2stats_binary_v0
150 |          hyperfine_v1_tc2stats = result.tc2stats_binary_v1
151 | 
152 |          for tc, time in hyperfine_v0_tc2stats.items():
153 |                tc2hyperfine_v0[tc].append(np.array(time))
154 |          for tc, time in hyperfine_v1_tc2stats.items():
155 |                tc2hyperfine_v1[tc].append(np.array(time))
156 | 
157 |       for tc, times_v0 in tc2hyperfine_v0.items():
158 |          mean_times_v0 = []
159 |          for time_list in times_v0:
160 |             mean_times_v0.append(np.mean(time_list))
161 |          mean_times_v1 = []
162 |          for time_list in tc2hyperfine_v1[tc]:
163 |             mean_times_v1.append(np.mean(time_list))
164 |          # consistency check
165 |          assert (np.std(mean_times_v0) / np.mean(mean_times_v0)
166 |                   ) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
167 |          assert (np.std(mean_times_v1) / np.mean(mean_times_v1)
168 |                   ) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
169 |          # performance check
170 |          assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)
171 |                   ) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
172 |          print(
173 |                f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
174 |          print(
175 |                f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
176 |          print(
177 |                f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
178 | 
179 |       assert len(tc2hyperfine_v0) == 2
180 |       assert len(tc2hyperfine_v1) == 2
181 | 
182 | 
183 |    def test_single_submission(self, get_pie_env):
184 |       env = get_pie_env
185 |       tc2hyperfine = defaultdict(list)
186 |       for _ in range(2): 
187 |          result = env.submit_single_submission(code=example_1_code,
188 |                                                 testcases=[0,1],
189 |                                                 problem_id=example_1_problem_id,
190 |                                                 timing_env="both")
191 | 
192 |          assert result.compilation == True 
193 |          assert result.tc2success[0] == True 
194 |          assert result.tc2success[1] == True 
195 |          assert result.tc2time[0] == 0.001035073468
196 |          assert result.tc2time[1] == 0.001039205596
197 |          assert result.mean_acc > 0.95
198 |          
199 |          hyperfine_result = result.tc2stats_binary
200 |          
201 |          for tc, results in hyperfine_result.items():
202 |             tc2hyperfine[tc].append(np.array(results))
203 | 
204 |       for tc, times in tc2hyperfine.items():
205 |          mean_times = []
206 |          for time_list in times:
207 |                mean_times.append(np.mean(time_list))
208 |          assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
209 |       assert len(tc2hyperfine) == 2
210 |       
211 | 
212 | 
213 |    def test_dual_submission_diff_code(self, get_pie_env):
214 |       env = get_pie_env
215 |       tc2hyperfine_v0 = defaultdict(list)
216 |       tc2hyperfine_v1 = defaultdict(list)
217 |       for _ in range(2): 
218 |          result = env.submit_single_submission_pair(code_v0=example_1_code, 
219 |                                                    code_v1=example_2_code,
220 |                                                    testcases=[0,1],
221 |                                                    problem_id=example_1_problem_id,
222 |                                                    timing_env="both")
223 |          
224 |          
225 |          assert result.compilation_v0 == True
226 |          assert result.compilation_v1 == True
227 |          
228 |          assert result.mean_acc_v0 > 0.95
229 |          assert result.mean_acc_v1 > 0.95
230 |          
231 |          pprint(result.tc2time_v0)
232 |          pprint(result.tc2time_v1)
233 |          
234 |          print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
235 |          print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
236 |          print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001026564396")
237 |          print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001029346032")
238 |          
239 |          assert result.tc2time_v0[0] == 0.001035073468
240 |          assert result.tc2time_v0[1] == 0.001039205596
241 |          
242 |          assert result.tc2time_v1[0] == 0.001026564396
243 |          assert result.tc2time_v1[1] == 0.001029346032
244 |          
245 |          hyperfine_v0_tc2stats = result.tc2stats_binary_v0
246 |          hyperfine_v1_tc2stats = result.tc2stats_binary_v1
247 | 
248 |          for tc, time in hyperfine_v0_tc2stats.items():
249 |                tc2hyperfine_v0[tc].append(np.array(time))
250 |          for tc, time in hyperfine_v1_tc2stats.items():
251 |                tc2hyperfine_v1[tc].append(np.array(time))
252 |                
253 |       for tc, times_v0 in tc2hyperfine_v0.items():
254 |          mean_times_v0 = []
255 |          for time_list in times_v0 :	
256 |                mean_times_v0.append(np.mean(time_list))
257 |          mean_times_v1 = []
258 |          for time_list in tc2hyperfine_v1[tc] :	
259 |                mean_times_v1.append(np.mean(time_list))
260 |          # consistency check
261 |          assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
262 |          assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
263 |          # performance check
264 |          assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
265 |          print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
266 |          print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
267 |          print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
268 | 
269 |       assert len(tc2hyperfine_v0) == 2
270 |       assert len(tc2hyperfine_v1) == 2
271 |    
272 |    
273 |    def test_dual_submission_same_code(self, get_pie_env):
274 |       env = get_pie_env
275 |       tc2hyperfine_v0 = defaultdict(list)
276 |       tc2hyperfine_v1 = defaultdict(list)
277 |       for _ in range(2): 
278 |          result = env.submit_single_submission_pair(code_v0=example_1_code, 
279 |                                                    code_v1=example_1_code,
280 |                                                    testcases=[0,1],
281 |                                                    problem_id=example_1_problem_id,
282 |                                                    timing_env="both")
283 |          
284 |          
285 |          assert result.compilation_v0 == True
286 |          assert result.compilation_v1 == True
287 |          
288 |          assert result.mean_acc_v0 > 0.95
289 |          assert result.mean_acc_v1 > 0.95
290 |          
291 |          pprint(result.tc2time_v0)
292 |          pprint(result.tc2time_v1)
293 |          
294 |          print(f"result.tc2time_v0[0] = {result.tc2time_v0[0]} should be 0.001035073468")
295 |          print(f"result.tc2time_v0[1] = {result.tc2time_v0[1]} should be 0.001039205596")
296 |          print(f"result.tc2time_v1[0] = {result.tc2time_v1[0]} should be 0.001035073468")
297 |          print(f"result.tc2time_v1[1] = {result.tc2time_v1[1]} should be 0.001039205596")
298 |          
299 |          assert result.tc2time_v0[0] == 0.001035073468
300 |          assert result.tc2time_v0[1] == 0.001039205596
301 |          
302 |          assert result.tc2time_v1[0] == 0.001035073468
303 |          assert result.tc2time_v1[1] == 0.001039205596
304 |          
305 |          hyperfine_v0_tc2stats = result.tc2stats_binary_v0
306 |          hyperfine_v1_tc2stats = result.tc2stats_binary_v1
307 | 
308 |          for tc, time in hyperfine_v0_tc2stats.items():
309 |                tc2hyperfine_v0[tc].append(np.array(time))
310 |          for tc, time in hyperfine_v1_tc2stats.items():
311 |                tc2hyperfine_v1[tc].append(np.array(time))
312 |                
313 |       for tc, times_v0 in tc2hyperfine_v0.items():
314 |          times_v1 = tc2hyperfine_v1[tc]
315 |          mean_times = []
316 |          for time_list in times_v0 + times_v1:	
317 |             mean_times.append(np.mean(time_list))
318 |          assert (np.std(mean_times) / np.mean(mean_times)) < 0.05, f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times}"
319 |          print(f"std/mean = {np.std(mean_times) / np.mean(mean_times)} for tc {tc} with mean times {mean_times} ")
320 |       assert len(tc2hyperfine_v0) == 2
321 | 
322 | 
323 |     
324 |    def test_multiple_single_submissions(self, get_pie_env):
325 |       
326 |       
327 |          code_list = [example_1_code, example_2_code] * 3
328 |          testcases_list = [[0, 1], [0, 1]] * 3
329 |          problem_id_list = [example_1_problem_id, example_2_problem_id] * 3
330 |          override_flags_list = ["", ""] * 3
331 |          
332 |          env = get_pie_env
333 | 
334 |          results = env.submit_multiple_single_submissions(code_list=code_list,
335 |                                                             testcases_list=testcases_list,
336 |                                                             problem_id_list=problem_id_list,
337 |                                                             override_flags_list=override_flags_list,
338 |                                                             timing_env="both")
339 | 
340 |          tc2hyperfine_v0 = defaultdict(list)
341 |          tc2hyperfine_v1 = defaultdict(list)
342 | 
343 |          for i, result in enumerate(results):
344 |             assert result.compilation == True 
345 |             assert result.tc2success[0] == True 
346 |             assert result.tc2success[1] == True 
347 |             
348 |             hyperfine_result = result.tc2stats_binary
349 | 
350 |             if (i % 2) == 0: 
351 |                assert result.tc2time[0] == 0.001035073468
352 |                assert result.tc2time[1] == 0.001039205596
353 |                tc2hyperfine = tc2hyperfine_v0
354 |             else: 
355 |                assert result.tc2time[0] == 0.001026564396
356 |                assert result.tc2time[1] == 0.001029346032
357 |                tc2hyperfine = tc2hyperfine_v1
358 |                
359 |             for tc, results in hyperfine_result.items():
360 |                tc2hyperfine[tc].append(np.array(results))
361 | 
362 |          for tc, times_v0 in tc2hyperfine_v0.items():
363 |             mean_times_v0 = []
364 |             for time_list in times_v0 :	
365 |                mean_times_v0.append(np.mean(time_list))
366 |             mean_times_v1 = []
367 |             for time_list in tc2hyperfine_v1[tc] :	
368 |                mean_times_v1.append(np.mean(time_list))
369 |       
370 |             print(f"std/mean v0 tc {tc}= {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0} ")
371 |             print(f"std/mean v1 tc {tc}= {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1} ")
372 |             print(f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1} for tc {tc}, with speedup {np.mean(mean_times_v0) / np.mean(mean_times_v1)}")
373 |       
374 |             # consistency check
375 |             assert (np.std(mean_times_v0) / np.mean(mean_times_v0)) < 0.05, f"std/mean = {np.std(mean_times_v0) / np.mean(mean_times_v0)} for tc {tc} with mean times {mean_times_v0}"
376 |             assert (np.std(mean_times_v1) / np.mean(mean_times_v1)) < 0.05, f"std/mean = {np.std(mean_times_v1) / np.mean(mean_times_v1)} for tc {tc} with mean times {mean_times_v1}"
377 |             # performance check
378 |             assert (np.mean(mean_times_v0) / np.mean(mean_times_v1)) > .95, f"mean_times_v0 {mean_times_v0} mean_times_v1 {mean_times_v1}"
379 |             
380 |       
381 |          assert len(tc2hyperfine_v0) == 2
382 |          assert len(tc2hyperfine_v1) == 2


--------------------------------------------------------------------------------
/gem5/template_config.yaml:
--------------------------------------------------------------------------------
1 | model_generated_outputs_path: "PATH_TO_YOUR_OUTPUTS"
2 | reference_file_path: "PATH_TO_REFERENCE_FILE_JSONL" # Path to the reference .jsonl file, containing the reference outputs along with all other metadata from the test set.
3 | output_dir: "PATH_TO_DIRECTORY_WHERE_YOU_WANT_TO_SAVE_EVALUATION_RESULTS"
4 | is_prompt_based: False # should always be False
5 | cpus_available: -1
6 | model_generated_potentially_faster_code_col: "generated_answers" # column in the model-generated outputs that contains the generated code; it should be a list of strings
7 | num_problems_to_evaluate: -1 # -1 means evaluate all problems
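# The evaluation driver also accepts the optional fields below; the values shown are the defaults from its
# EvaluationConfig dataclass. Uncomment to override (listing added for convenience; it is not exhaustive).
# slow_code_col: "src_code"          # column containing the slow input program
# reference_code_col: "tgt_code"     # column containing the reference (human-optimized) program
# threshold_accuracy: 1.0            # generations with accuracy below this are treated as incorrect
# redo_src_tgt: False                # set True to re-benchmark src/tgt code instead of reusing cached agg_runtimes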
8 | 


--------------------------------------------------------------------------------
/openai_finetuning/README.md:
--------------------------------------------------------------------------------
 1 | The script `finetune_openai.py` was used to finetune GPT-3.5 Turbo. Its usage is as follows:
 2 | 
 3 | ```bash
 4 | python finetune_openai.py PATH_TO_CONFIG.yaml
 5 | ```
 6 | 
 7 | We've included a sample config file `openai_config.yaml` in this directory. The config file should contain the following fields:
 8 | 
 9 | ```yaml
10 | api_key: "YOUR_OPENAI_API_KEY"
11 | organization: "YOUR_OPENAI_ORGANIZATION (optional)"
12 | input_train_path: "PATH_TO_TRAINING_DATA"
13 | input_test_path: "PATH_TO_VALIDATION_DATA"
14 | max_train: -1
15 | max_val: -1
16 | max_len: -1
17 | epochs: NUMBER_OF_EPOCHS (we used 1)
18 | output_dir: "PATH_TO_OUTPUT_DIR"
19 | model_suffix: "SUFFIX_FOR_MODEL_NAME"
20 | ```
21 | 
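The training and validation files are `.jsonl` files with one example per line. The script reads the
`src_code` and `tgt_code` columns and converts each pair into OpenAI's chat fine-tuning format before
uploading. A sketch of one input line (program text is an illustrative placeholder):

```json
{"src_code": "int main() { /* slow */ }", "tgt_code": "int main() { /* fast */ }"}
```

which the script converts to:

```json
{"messages": [
  {"role": "system", "content": "You are a helpful assistant that can optimize code."},
  {"role": "user", "content": "// slower version::\n\n<src_code>\n\n// optimized version of the same code:\n\n"},
  {"role": "assistant", "content": "<tgt_code>"}
]}
```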


--------------------------------------------------------------------------------
/openai_finetuning/finetune_openai.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd 
  2 | import os 
  3 | import sys 
  4 | sys.path.append(os.path.dirname(os.path.realpath(__file__)))
  5 | import pie_chatgpt
  6 | import re
  7 | import json
  8 | from typing import List, Dict
  9 | import yaml 
 10 | import logging 
 11 | import shutil 
 12 | import uuid 
 13 | import time 
 14 | import json
 15 | import os
 16 | from time import sleep
 17 | from io import StringIO
 18 | import openai 
 19 | 
 20 | logger = logging.getLogger(__name__)  # module-level logger used by the helper functions below
 21 | 
 22 | def load_data(train_path, test_path, max_train, max_val):
 23 |     df_train = pd.read_json(train_path, lines=True, orient='records')
 24 |     df_train = df_train.sample(frac=1).reset_index(drop=True)
 25 |     df_train = df_train[:max_train]
 26 |     df_test = pd.read_json(test_path, lines=True, orient='records')
 27 |     df_test = df_test.sample(frac=1).reset_index(drop=True)
 28 |     df_test = df_test[:max_val]
 29 |     return df_train, df_test
 30 | 
 31 |         
 32 |         
 33 | def prepare_output(code_str, max_len=-1, tokenizer=None):
 34 |     # optionally truncate the target code to max_len tokens (a tokenizer is required so truncation is measured in tokens)
 35 |     if max_len > 0 and tokenizer:
 36 |         code_str = tokenizer.decode(tokenizer.encode(code_str)[:max_len])
 37 |     elif max_len > 0 and not tokenizer:
 38 |         raise ValueError("max_len > 0 but no tokenizer provided")
 39 |     return code_str
 40 |         
 41 |         
 42 | def prepare_dataset(df, src_code_col, tgt_code_col, max_len=-1, tokenizer=None, max_examples=-1):
 43 |     df = df.copy()
 44 |     if max_examples > 0:
 45 |         df = df.sample(frac=1).reset_index(drop=True)
 46 |         df = df[:max_examples]
 47 |     training_examples = []
 48 |     for i, row in df.iterrows():
 49 |         src_code = row[src_code_col]
 50 |         src_code_formatted = pie_chatgpt.ChatGPTWrapper.prepare_input(src_code)
 51 |         tgt_code = row[tgt_code_col]
 52 |         tgt_code_formatted = prepare_output(tgt_code, max_len=max_len, tokenizer=tokenizer)
 53 |         
 54 |         d = [
 55 |             {"role": "system", "content": "You are a helpful assistant that can optimize code."},
 56 |             {"role": "user", "content": src_code_formatted},
 57 |             {"role": "assistant", "content": tgt_code_formatted},
 58 |         ]
 59 |         training_examples.append({"messages": d})
 60 |     return training_examples
 61 | 
 62 | 
 63 | 
 64 | def save_dataset(training_examples: List[Dict], file_name: str):
 65 |     with open(file_name, 'w') as jsonl_file:
 66 |         for example in training_examples:
 67 |             jsonl_file.write(json.dumps(example) + '\n')
 68 | 
 69 | 
 70 | def register_file_openai(file_path, outpath, sleep_interval=30):
 71 |     logger.info(f"Registering file {file_path} to OpenAI")
 72 |     file_dict = openai.File.create(
 73 |         file=open(file_path, "rb"),
 74 |         purpose='fine-tune',
 75 |     )
 76 |     logger.info(f"File registered with id {file_dict['id']}")
 77 |     while file_dict['status'] != 'processed':
 78 |         file_dict = openai.File.retrieve(file_dict['id'])
 79 |         logger.info(f"File status: {file_dict['status']}")
 80 |         with open(outpath, 'w') as json_file:
 81 |             json.dump(file_dict, json_file)
 82 |         if file_dict['status'] != 'processed':
 83 |             logger.info(f"Sleeping for {sleep_interval} seconds")
 84 |         sleep(sleep_interval)
 85 |     return file_dict
 86 |     
 87 | 
 88 | def main(input_train_path, input_test_path, max_train, max_val, max_len, tokenizer,output_dir, model_suffix="pie_opt", epochs=1):
 89 |     logging.info(f"Input train path: {input_train_path}; epochs: {epochs}")
 90 |     if not os.path.exists(output_dir):
 91 |         os.makedirs(output_dir)
 92 |     df_train, df_test = load_data(input_train_path, input_test_path, max_train, max_val)
 93 |     logger.info(f"Loaded {len(df_train)} training examples and {len(df_test)} test examples")
 94 |     training_examples = prepare_dataset(df_train, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
 95 |     if os.path.exists(os.path.join(output_dir, "train.jsonl")):
 96 |         unique_id = uuid.uuid4()
 97 |         logger.warning(f"File {os.path.join(output_dir, 'train.jsonl')} already exists, copying to {os.path.join(output_dir, f'train_{unique_id}.jsonl')}")
 98 |         shutil.copy(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, f"train_{unique_id}.jsonl"))
 99 |     save_dataset(training_examples, os.path.join(output_dir, "train.jsonl"))
100 |     training_examples = prepare_dataset(df_test, "src_code", "tgt_code", max_len=max_len, tokenizer=tokenizer)
101 |     if os.path.exists(os.path.join(output_dir, "test.jsonl")):
102 |         unique_id = uuid.uuid4()
103 |         logger.warning(f"File {os.path.join(output_dir, 'test.jsonl')} already exists, copying to {os.path.join(output_dir, f'test_{unique_id}.jsonl')}")
104 |         shutil.copy(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, f"test_{unique_id}.jsonl"))
105 |     save_dataset(training_examples, os.path.join(output_dir, "test.jsonl"))
106 |     train_data = register_file_openai(os.path.join(output_dir, "train.jsonl"), os.path.join(output_dir, "openai_train_file.json"))
107 |     val_data = register_file_openai(os.path.join(output_dir, "test.jsonl"), os.path.join(output_dir, "openai_val_file.json"))
108 |     train_data, val_data = wait_on_data(train_data, val_data)
109 |     assert train_data['status'] == 'processed'
110 |     assert val_data['status'] == 'processed'
111 |     with open(os.path.join(output_dir, "openai_train_file.json"), 'w') as train_json_file, open(os.path.join(output_dir, "openai_val_file.json"), 'w') as val_json_file:
112 |         json.dump(train_data, train_json_file)
113 |         json.dump(val_data, val_json_file)
114 |     
115 |     model = openai.FineTuningJob.create(
116 |         model = "gpt-3.5-turbo", 
117 |         training_file = train_data['id'],
118 |         validation_file = val_data['id'],
119 |         suffix = model_suffix, 
120 |         hyperparameters = {"n_epochs": epochs}
121 |     )
122 |     logging.info(f"Model {model['id']} created")
123 |     logging.info(f"Model dict: {model}")
124 |     monitor_model(model, output_dir)
125 |     return model     
126 |     
127 | def wait_on_data(train_data, val_data, max_timeout = 600, sleep_interval=10):
128 |     start = time.time()
129 |     while train_data['status'] != 'processed' or val_data['status'] != 'processed':
130 |         train_data = openai.File.retrieve(train_data['id'])
131 |         val_data = openai.File.retrieve(val_data['id'])
132 |         logger.info(f"Train data status: {train_data['status']} status_details: {train_data['status_details']}")
133 |         logger.info(f"Val data status: {val_data['status']}, status_details: {val_data['status_details']}")
134 |         if time.time() - start > max_timeout:
135 |             raise TimeoutError("Timeout waiting for data")
136 |         logger.info(f"Sleeping for {sleep_interval} seconds")
137 |         sleep(sleep_interval)
138 |     return train_data, val_data
139 |     
140 | 
141 | def get_step_metrics(file_id):
142 |     content = openai.File.download(file_id)
143 |     eval_result = StringIO(content.decode())
144 |     df = pd.read_csv(eval_result, sep=",")
145 |     return df
146 | 
147 | 
148 | def handle_get_step_metrics(file_id, output_dir):
149 |     content = openai.File.download(file_id)
150 |     eval_result = StringIO(content.decode())
151 |     try: 
152 |         df = pd.read_csv(eval_result, sep=",")
153 |         df.to_csv(os.path.join(output_dir, f"success_{file_id}.csv"), index=False)
154 |         return df
155 |     except Exception as e:
156 |         error_message = f"Error reading file {file_id}: {e}\n"
157 |         file_content_message = f"File content: {content}\n"
158 |         file_content_decoded_message = f"File content decoded: {content.decode()}\n"
159 |         eval_result_content_message = f"Eval result content: {eval_result.getvalue()}\n"
160 | 
161 |         with open(os.path.join(output_dir, f"error_{file_id}.txt"), 'w') as error_file:
162 |             error_file.write(error_message)
163 |             error_file.write(file_content_message)
164 |             error_file.write(file_content_decoded_message)
165 |             error_file.write(eval_result_content_message)
166 |         
167 |         logger.error(error_message)
168 |         logger.error(file_content_message)
169 |         logger.error(file_content_decoded_message)
170 |         logger.error(eval_result_content_message)
171 | 
172 |         return None
173 |     
174 | SAMPLE_CPP_PROGRAM_TO_OPTIMIZE = """
175 | #include 
176 | #include 
177 | #include 
178 | #include 
179 | 
180 | int main(int argc, char** argv) {
181 |     int n = 1000000;
182 |     int* a = (int*) malloc(n * sizeof(int));
183 |     int* b = (int*) malloc(n * sizeof(int));
184 |     int* c = (int*) malloc(n * sizeof(int));
185 |     for (int i = 0; i < n; i++) {
186 |         a[i] = i;
187 |         b[i] = i;
188 |     }
189 |     for (int i = 0; i < n; i++) {
190 |         c[i] = a[i] + b[i];
191 |     }
192 |     printf("%d", c[0]);
193 |     free(a);
194 |     free(b);
195 |     free(c);
196 |     return 0;
197 | }
198 | """
199 | 
200 | 
201 | 
202 | 
203 | def monitor_model(model_dict, output_dir, sleep_interval=30): 
204 |     model = openai.FineTuningJob.retrieve(model_dict['id'])
205 |     logger.info(f"Model status: {model['status']}")
206 |     while model['status'] not in ('succeeded', 'failed', 'cancelled'):  # stop polling on any terminal state
207 |         model = openai.FineTuningJob.retrieve(model_dict['id'])
208 |         logger.info(f"Model status: {model['status']}")
209 |         if model['status'] != 'succeeded':
210 |             logger.info(f"Sleeping for {sleep_interval} seconds")
211 |         if "result_files" in model:
212 |             for file_id in model['result_files']:
213 |                 if file_id is not None:
214 |                     result = openai.File.download(file_id)
215 |                     with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file:
216 |                         result_file.write(result)
217 |                         logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.csv')}")
218 |                     try: 
219 |                         df = pd.read_csv(os.path.join(output_dir, f"result_{file_id}.csv"))
220 |                         last_row = df.iloc[-1]
221 |                         logger.info(f"Last row: {last_row}")
222 |                     except Exception as e:
223 |                         logger.error(f"Error reading file {file_id}: {e}")
224 |                         logger.error(f"File content: {result}")
225 |                         logger.error(f"File content decoded: {result.decode()}")
226 | 
227 |         with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
228 |             json.dump(model, json_file)
229 |         sleep(sleep_interval)
230 |         
231 |     if "result_files" in model:
232 |         for file_id in model['result_files']:
233 |             if file_id is not None:
234 |                 result = openai.File.download(file_id)
235 |                 with open(os.path.join(output_dir, f"result_{file_id}.csv"), 'wb') as result_file:  # 'wb'
236 |                     result_file.write(result)
237 |                 logging.info(f"Result file {file_id} saved to {os.path.join(output_dir, f'result_{file_id}.csv')}")
238 |                 
239 |     with open(os.path.join(output_dir, "openai_model.json"), 'w') as json_file:
240 |         json.dump(model, json_file)
241 |     
242 |     # parse the clock time 
243 |     # finished_at = model['finished_at']
244 |     # started_at = model['started_at']
245 |     # total_time = finished_at - started_at
246 |     finished_at = model.get('finished_at', None)
247 |     started_at = model.get('started_at', None)
248 |     if finished_at is not None and started_at is not None:
249 |         total_time = finished_at - started_at
250 |         logging.info(f"Model {model['id']} finished in {total_time / 60} minutes")
251 |     if "trained_tokens" in model:
252 |         logging.info(f"Model {model['id']} trained tokens: {model['trained_tokens']}")
253 |         
254 |     logging.info(f"Model {model['id']} fine-tuned model: {model['fine_tuned_model']}")
255 |     
256 |     
257 |     chat_log = [
258 |         {"role": "system", "content": "You are a helpful assistant that can optimize code."},
259 |         {"role": "user", "content": pie_chatgpt.ChatGPTWrapper.prepare_input(SAMPLE_CPP_PROGRAM_TO_OPTIMIZE)},
260 |     ]
261 |     
262 |     try: 
263 |         response = openai.ChatCompletion.create(
264 |             model=model['fine_tuned_model'],
265 |             messages=chat_log,
266 |             max_tokens=1000,
267 |             temperature=0.0,
268 |         )
269 |         logging.info(f"************************")
270 |         logging.info(f"Input program: {SAMPLE_CPP_PROGRAM_TO_OPTIMIZE}")
271 |         logging.info("************************")
272 |         logging.info(f"Output program: {response['choices'][0]['message']['content']}")
273 |     except Exception as e:
274 |         logging.error(f"Error calling OpenAI API: {e}")
275 |         logging.error(f"Chat log: {chat_log}")
276 |     
277 |     return model
278 |     
279 | 
280 | def load_config(yaml_path):
281 |     with open(yaml_path, 'r') as file:
282 |         config = yaml.safe_load(file)
283 |     return config
284 | 
285 | 
286 | 
287 | if __name__ == "__main__":
288 |     import transformers 
289 |     tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
290 |     
291 |     if len(sys.argv) > 1:
292 |         config_path = sys.argv[1]
293 |     else: 
294 |         raise ValueError("No config path provided")
295 |     config = load_config(config_path)
296 |     
297 |     openai.api_key = config['api_key']
298 |     if 'organization' in config and config['organization']:
299 |         openai.organization = config['organization']
300 |     
301 |     assert len(config['model_suffix']) > 0 and len(config['model_suffix']) < 19, "model_suffix must be between 1 and 18 characters"
302 |     
303 |     logger = logging.getLogger(__name__)
304 |     ## log date and time
305 |     if not os.path.exists(config['output_dir']):
306 |         os.makedirs(config['output_dir'])
307 |     logging.basicConfig(
308 |     level=logging.INFO, 
309 |     format='%(asctime)s %(message)s',
310 |     handlers=[
311 |         logging.FileHandler(os.path.join(config['output_dir'], 'chatgpt_fine_tuning.log')),
312 |         logging.StreamHandler()
313 |     ]
314 |     )
315 |     
316 |     logging.info(f"Config: {config}")
317 |         
318 |     main(
319 |         input_train_path=config['input_train_path'],
320 |         input_test_path=config['input_test_path'],
321 |         max_train=config['max_train'],
322 |         max_val=config['max_val'],
323 |         max_len=config['max_len'],
324 |         tokenizer=tokenizer,
325 |         output_dir=config['output_dir'],
326 |         model_suffix=config['model_suffix'], 
327 |         epochs=config['epochs']
328 |     )
329 | 
330 | 


--------------------------------------------------------------------------------
/openai_finetuning/openai_config.yaml:
--------------------------------------------------------------------------------
 1 | api_key: ""
 2 | organization: ""
 3 | input_train_path: ""
 4 | input_test_path: ""
 5 | max_train: -1
 6 | max_val: -1
 7 | max_len: -1
 8 | epochs: 1
 9 | output_dir: ""
10 | model_suffix: ""


--------------------------------------------------------------------------------
/openai_finetuning/pie_chatgpt.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import openai
  3 | import random, time  # time is needed for time.sleep() in the retry decorator below
  4 | import tiktoken
  5 | from tqdm import tqdm
  6 | from typing import List
  7 | from concurrent.futures import ThreadPoolExecutor
  8 | 
  9 | random.seed(42)
 10 | 
 11 | 
 12 | def retry_with_exponential_backoff(
 13 |     func,
 14 |     initial_delay: float = 1,
 15 |     exponential_base: float = 2,
 16 |     jitter: bool = True,
 17 |     max_retries: int = 10,
 18 |     errors: tuple = (
 19 |         openai.error.RateLimitError,
 20 |         openai.error.ServiceUnavailableError,
 21 |     ),
 22 | ):
 23 |     """Retry a function with exponential backoff."""
 24 | 
 25 |     def wrapper(*args, **kwargs):
 26 |         # Initialize variables
 27 |         num_retries = 0
 28 |         delay = initial_delay
 29 | 
 30 |         # Loop until a successful response or max_retries is hit or an exception is raised
 31 |         while True:
 32 |             try:
 33 |                 return func(*args, **kwargs)
 34 | 
 35 |             # Retry on specified errors
 36 |             except errors as e:
 37 |                 # Increment retries
 38 |                 num_retries += 1
 39 | 
 40 |                 # Check if max retries has been reached
 41 |                 if num_retries > max_retries:
 42 |                     raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
 43 | 
 44 |                 # Increment the delay
 45 |                 delay *= exponential_base * (1 + jitter * random.random())
 46 | 
 47 |                 # Sleep for the delay
 48 |                 time.sleep(delay)
 49 |                 print(f"\nRetrying after {delay:.2f} seconds.")
 50 | 
 51 |             # Raise exceptions for any errors not specified
 52 |             except Exception as e:
 53 |                 raise e
 54 | 
 55 |     return wrapper
 56 | 
 57 | 
 58 | class ChatGPTWrapper:
 59 |     """A Wrapper for ChatGPT model interaction."""
 60 | 
 61 |     @staticmethod
 62 |     def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
 63 |         """
 64 |         Calculate the number of tokens in a text string.
 65 | 
 66 |         Args:
 67 |         - string (str): The text string to be tokenized.
 68 |         - encoding_name (str, optional): The encoding name for tokenization. Defaults to "cl100k_base".
 69 |         Returns:
 70 |         - int: Number of tokens in the string.
 71 |         """
 72 |         encoding = tiktoken.get_encoding(encoding_name)
 73 |         num_tokens = len(encoding.encode(string))
 74 |         return num_tokens
 75 | 
 76 |     @staticmethod
 77 |     @retry_with_exponential_backoff
 78 |     def call_openai_api(
 79 |         slow_code_str: str, max_tokens: int = 1024, temperature: float = 0.0
 80 |     ) -> str:
 81 |         """
 82 |         Calls the OpenAI API to optimize a given code.
 83 | 
 84 |         Args:
 85 |         - slow_code_str (str): The code string that needs to be optimized.
 86 | 
 87 |         - max_tokens (int, optional): The maximum number of tokens to be used for generation. Defaults to 1024.
 88 |         
 89 |         - temperature (float, optional): The temperature value for generation. Defaults to 0.0.
 90 | 
 91 |         Returns:
 92 |         - str: Optimized code returned by the OpenAI API.
 93 |         """
 94 |         # Initialize the chat log with system and user inputs
 95 |         start_chat_log = [
 96 |             {"role": "system", "content": "You are a helpful assistant that can optimize code."},
 97 |             {"role": "user", "content": ChatGPTWrapper.prepare_input(slow_code_str)},
 98 |         ]
 99 |         # Call the OpenAI API with the given chat log
100 |         response = openai.ChatCompletion.create(
101 |             model="gpt-3.5-turbo-0613",
102 |             messages=start_chat_log,
103 |             max_tokens=max_tokens,
104 |             temperature=temperature,
105 |         )
106 |         # Extract the optimized code from the response
107 |         return response["choices"][0]["message"]["content"]
108 | 
109 |     @staticmethod
110 |     def prepare_input(slow_code_str: str) -> str:
111 |         """
112 |         Prepares the input for the OpenAI API by framing the code to be optimized.
113 | 
114 |         Args:
115 |         - slow_code_str (str): The code string that needs to be framed for optimization.
116 | 
117 |         Returns:
118 |         - str: Formatted input for the OpenAI API.
119 |         """
120 |         prompt = f"""// slower version::
121 | 
122 | {slow_code_str}
123 | 
124 | // optimized version of the same code:
125 | 
126 | """
127 |         return prompt
128 | 
129 | 
130 | QUESTION_PREFIX = "# slower version:\n\n"
131 | ANSWER_PREFIX = "# optimized version of the same code:\n\n"
132 | 
133 | 
134 | 
135 | def main(input_file: str, output_file: str):
136 |     # Read the jsonl file using pandas
137 |     df = pd.read_json(input_file, lines=True)
138 | 
139 |     # Ensure src_code is in the dataframe
140 |     if 'src_code' not in df.columns:
141 |         raise ValueError("'src_code' column not found in the input file.")
142 |     
143 |     # Optimize code using multiple threads
144 |     df['optimized_code'] = optimize_code_parallel(df['src_code'].tolist())
145 |     
146 |     # Save the dataframe to a new jsonl file
147 |     df.to_json(output_file, orient='records', lines=True)
148 | 
149 | 
150 | def optimize_code_parallel(code_list: List[str], max_workers: int = 5) -> List[str]:
151 |     """
152 |     Function to optimize code using multiple threads.
153 |     
154 |     Args:
155 |     - code_list (List[str]): List of code strings to optimize.
156 |     - max_workers (int): Number of worker threads.
157 |     
158 |     Returns:
159 |     - List[str]: List of optimized code strings.
160 |     """
161 |     with ThreadPoolExecutor(max_workers=max_workers) as executor:
162 |         optimized_code_list = list(tqdm(executor.map(ChatGPTWrapper.call_openai_api, code_list), total=len(code_list)))
163 |     return optimized_code_list
164 | 
165 | if __name__ == "__main__":
166 |     import sys
167 | 
168 |     if len(sys.argv) != 3:
169 |         print("Usage: python pie_chatgpt.py <input_file> <output_file>")
170 |         sys.exit(1)
171 |     main(input_file=sys.argv[1], output_file=sys.argv[2])


--------------------------------------------------------------------------------
/retrieval/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Retrieval
2 | 
3 | The notebook `retrieval.ipynb` can be used to prepare the retrieval dataset. Given a training dataset and the test-set examples to optimize, it retrieves the K most similar training example pairs for each test example. The retrieved pairs are then used to prompt the model for optimized outputs.
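4 | 
5 | The notebook defines the exact retrieval procedure; purely as an illustration of the idea, a simple nearest-neighbour variant might look like the sketch below. The TF-IDF similarity and the `src_code`/`tgt_code` field names are assumptions made for this sketch, not necessarily what `retrieval.ipynb` implements.
6 | 
7 | ```python
8 | # Illustrative sketch only: retrieve the K most similar training pairs for each test example.
9 | import pandas as pd
10 | from sklearn.feature_extraction.text import TfidfVectorizer
11 | from sklearn.neighbors import NearestNeighbors
12 | 
13 | def retrieve_similar_pairs(train_file: str, test_file: str, k: int = 4) -> pd.DataFrame:
14 |     train = pd.read_json(train_file, lines=True)
15 |     test = pd.read_json(test_file, lines=True)
16 | 
17 |     # Represent each program as a character n-gram TF-IDF vector (a stand-in for a learned encoder).
18 |     vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
19 |     train_vecs = vectorizer.fit_transform(train["src_code"])
20 |     test_vecs = vectorizer.transform(test["src_code"])
21 | 
22 |     # For each test program, find the K closest training programs by cosine distance.
23 |     nn = NearestNeighbors(n_neighbors=k, metric="cosine").fit(train_vecs)
24 |     _, idx = nn.kneighbors(test_vecs)
25 | 
26 |     # Attach the retrieved (slow, fast) pairs so they can be placed into the few-shot prompt.
27 |     test["retrieved_pairs"] = [
28 |         train.iloc[rows][["src_code", "tgt_code"]].to_dict("records") for rows in idx
29 |     ]
30 |     return test
31 | ```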


--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | The file `sample_and_eval.py` shows how to chain together sampling from a Hugging Face model and then using those generations for evaluation. It takes a YAML config file in the same format as `template_config.yaml` (see the example invocation below). Use it with caution: during our work we ran each stage separately, and this script is mainly illustrative; it may contain errors because we did not run it end-to-end in this form.
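2 | 
3 | If you do want to try it, a typical invocation (assuming the repository root as the working directory and a filled-in copy of the template config) would be:
4 | 
5 | ```
6 | python scripts/sample_and_eval.py scripts/template_config.yaml
7 | ```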


--------------------------------------------------------------------------------
/scripts/sample_and_eval.py:
--------------------------------------------------------------------------------
  1 | import subprocess
  2 | import time
  3 | import logging
  4 | import sys
  5 | import yaml
  6 | import shutil
  7 | import os
  8 | 
  9 | def start_generation_container(model, volume, max_best_of, port=4242, startup_timeout=600):
 10 |     # command = f"docker run --detach --gpus all --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
 11 |     # the command below pins the container to GPUs 1,2,3,4,5,6,7
 12 |     if not model.startswith("codellama"):
 13 |         model = f"data/{model}"
 14 |     # the first (commented-out) command above may be used instead to expose all GPUs
 15 |     command = f"docker run --detach --gpus 1,2,3,4,5,6,7 --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
 16 |     # use the following line for podman or potentially for a different docker installation, the nvidia-docker command may vary 
 17 |     # command = f"docker run --detach -e NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 --shm-size 1g -p {port}:80 -v {volume}:/data ghcr.io/huggingface/text-generation-inference:latest --model-id {model} --max-best-of {max_best_of}"
 18 |     container_id = subprocess.check_output(command, shell=True).decode().strip()
 19 |     # wait until the logs say Connected
 20 |     while True:
 21 |         logging.info(f"Waiting for container to start with id {container_id} and timeout {startup_timeout} left")
 22 |         logs = subprocess.check_output(f"docker logs {container_id}", shell=True).decode()
 23 |         if "Connected" in logs:
 24 |             break
 25 |         time.sleep(5)
 26 |         startup_timeout -= 5
 27 |         if startup_timeout <= 0:
 28 |             raise TimeoutError("Timeout waiting for container to start")
 29 |     return container_id
 30 | 
 31 | def stop_generation_container(container_id):
 32 |     subprocess.run(f"docker stop {container_id}", shell=True)
 33 | 
 34 | def remove_generation_container(container_id):
 35 |     subprocess.run(f"docker rm {container_id}", shell=True)
 36 |     
 37 | 
 38 | def sample_from_container(test_file, output_file, do_sample, num_samples=8, max_new_tokens=1000, temperature=0.7, num_threads=20, prompt_name="code_opt"): 
 39 |     logging.info(f"Sampling from container with test_file {test_file} and output_file {output_file}")
 40 |     command = f"python finetuning/sample.py --test_file {test_file} --output_file {output_file} --do_sample {do_sample} --num_samples {num_samples} --max_new_tokens {max_new_tokens} --temperature {temperature} --num_threads {num_threads} --prompt_name {prompt_name}"
 41 |     logging.info(f"Running command {command}")
 42 |     p = subprocess.run(command, shell=True)
 43 |     logging.info(f"sample.py returned with code {p.returncode}")
 44 |     return p.returncode
 45 | 
 46 | def run_eval(eval_args, model_generated_outputs_path):
 47 |     eval_args["model_generated_outputs_path"] = model_generated_outputs_path
 48 |     eval_output_dir = eval_args["output_dir"]
 49 |     if not os.path.exists(eval_output_dir):
 50 |         os.makedirs(eval_output_dir)
 51 |     else: 
 52 |         logging.warning(f"Output directory {eval_output_dir} already exists, overwriting")
 53 |     with open(os.path.join(eval_output_dir, "config.yaml"), "w") as f:
 54 |         yaml.dump(eval_args, f)
 55 |     logging.info(f"Running eval with args {eval_args}")
 56 |     cmd = f"python gem5/gem5_eval.py --config_path {os.path.join(eval_output_dir, 'config.yaml')}"
 57 |     logging.info(f"Running command {cmd}")
 58 |     p = subprocess.run(cmd, shell=True)
 59 |     logging.info(f"gem5_eval.py returned with code {p.returncode}")
 60 |     logging.info("Done")
 61 |     
 62 | 
 63 | def main(): 
 64 |     cfg_path = sys.argv[1]
 65 |     with open(cfg_path, 'r') as f:
 66 |         cfg = yaml.safe_load(f)
 67 |     text_gen_args = cfg["text_gen_args"]
 68 |     sampling_args = cfg["sampling_args"]
 69 |     eval_args = cfg["eval_args"]
 70 | 
 71 |     # Check if the output directory for evaluation exists
 72 |     if os.path.exists(eval_args['output_dir']):
 73 |         logging.info(f"Output directory {eval_args['output_dir']} already exists. Skipping the entire script.")
 74 |         return
 75 | 
 76 |     # Check if the output file from sampling exists
 77 |     if os.path.exists(sampling_args['output_file']):
 78 |         logging.info(f"Output file {sampling_args['output_file']} from sampling already exists. Skipping container startup and sampling.")
 79 |     else:
 80 |         # Start the container and perform sampling
 81 |         logging.info(f"Starting generation container with args {text_gen_args}")
 82 |         container_id = start_generation_container(text_gen_args["generation_model_name"], text_gen_args["volume_mount"], text_gen_args["max_best_of"], port=text_gen_args["port"])
 83 |         logging.info(f"Sampling from container with args {sampling_args}")
 84 |         sample_from_container(**sampling_args)
 85 |         # Stop and remove the container
 86 |         logging.info(f"Stopping container with id {container_id}")
 87 |         stop_generation_container(container_id)
 88 |         logging.info(f"Removing container with id {container_id}")
 89 |         remove_generation_container(container_id)
 90 |         logging.info("Successfully removed container")
 91 | 
 92 |     # Run evaluation
 93 |     logging.info(f"Setting model_generated_outputs_path to {sampling_args['output_file']} and running eval with args {eval_args}")
 94 |     run_eval(eval_args, sampling_args["output_file"])
 95 | 
 96 |     
 97 | if __name__ == "__main__":
 98 |     main()
 99 |     
100 |     
101 |     
102 |     
103 |     
104 |     
105 |     


--------------------------------------------------------------------------------
/scripts/template_config.yaml:
--------------------------------------------------------------------------------
 1 | text_gen_args:
 2 |   generation_model_name: "your_model_name"
 3 |   volume_mount: "/path/to/your/volume"
 4 |   max_best_of: 5
 5 |   port: 4242
 6 | 
 7 | sampling_args:
 8 |   test_file: "/path/to/your/test_file"
 9 |   output_file: "/path/to/your/output_file"
10 |   do_sample: true
11 |   num_samples: 8
12 |   max_new_tokens: 1000
13 |   temperature: 0.7
14 |   num_threads: 20
15 |   prompt_name: "code_opt"
16 | 
17 | eval_args:
18 |   output_dir: "/path/to/your/evaluation_output_directory"
19 |   is_prompt_based: false
20 |   cpus_available: -1
21 |   model_generated_potentially_faster_code_col: "generated_answers"
22 |   num_problems_to_evaluate: -1
23 | 


--------------------------------------------------------------------------------