├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── data.py ├── dataset ├── README.md └── convert.py ├── demo.gif ├── evaluate.py ├── hf_model.png ├── icon.png ├── interact.py ├── model.py ├── requirements.txt ├── train.py └── trainer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dataset/source_code/ 2 | model/ 3 | venv/ 4 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ![](icon.png) AutoCoder 2 | 3 | Contributions welcome 4 | 5 | #### A basic and simple tool for code auto completion, fine-tuned from the pytorch [pre-trained GPT-2 variants](https://huggingface.co/transformers/pretrained_models.html) offered by the awesome [🤗 transformers](https://github.com/huggingface/transformers) library. 
6 | 
7 | ### Demo
8 | ![demo](demo.gif)
9 | 
10 | ### [Play on 🤗HF's Model Hub](https://huggingface.co/congcongwang/gpt2_medium_fine_tuned_coder?text=%3Cpython%3E+def+factorial)👇
11 | 
12 | ![](hf_model.png)
13 | 
14 | ### Features
15 | - Write with Python or Java.
16 | 
17 | ### Blog linked to this project
18 | - [The details of dataset construction and fine-tuning process](https://wangcongcong123.github.io/AutoCoder/)
19 | 
20 | ### Quick Start
21 | Three quick-start options are provided below.
22 | 
23 | 
24 | #### Load from 🤗transformers models
25 | There are now [two fine-tuned models](https://huggingface.co/models?search=congcongwang) uploaded to the 🤗transformers model hub. They can be used easily as long as you `pip install transformers`.
26 | 
27 | 
28 | ```python
29 | from transformers import AutoTokenizer, AutoModelWithLMHead
30 | tokenizer = AutoTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
31 | model = AutoModelWithLMHead.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
32 | # or
33 | # tokenizer = AutoTokenizer.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
34 | # model = AutoModelWithLMHead.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
35 | use_cuda = True
36 | context = "def factorial"
37 | lang = "python"  # can be "java" as well.
38 | 
39 | if use_cuda:
40 |     model.to("cuda")
41 | 
42 | input_ids = tokenizer.encode("<python> " + context,  # "<python>" / "<java>" are the language control tokens
43 |                              return_tensors='pt') if lang == "python" else tokenizer.encode(
44 |     "<java> " + context, return_tensors='pt')
45 | outputs = model.generate(input_ids=input_ids.to("cuda") if use_cuda else input_ids,
46 |                          max_length=128,
47 |                          temperature=0.7,
48 |                          num_return_sequences=1)
49 | 
50 | decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
51 | print(decoded)
52 | ```
53 | 
54 | 
55 | 
56 | #### Ready-to-go Interaction
57 | ```
58 | git clone https://github.com/wangcongcong123/auto_coding.git
59 | pip install -r requirements.txt
60 | ```
61 | 
62 | 1. Download a fine-tuned model; two versions are provided:
63 |     * [distilgpt2_fine_tuned_coder (params: 82M, size: 291MB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/distilgpt2_fine_tuned_coder.zip)
64 |     * [gpt2_medium_fine_tuned_coder (params: 345M, size: 1.22GB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/gpt2_medium_fine_tuned_coder.zip)
65 | 2. Unzip the model and move it to `./model` (create the directory first).
66 | 3. Run the interactive script: `python interact.py`
67 | 
68 | #### Fine-tuning yours
69 | ```
70 | git clone https://github.com/wangcongcong123/auto_coding.git
71 | pip install -r requirements.txt
72 | ```
73 | 
74 | 1. Prepare [the dataset](./dataset)
75 | 2. Start fine-tuning the model: `python train.py --model_select distilgpt2`
76 | 3. After fine-tuning, the model will be saved to `./model/distilgpt2_fine_tuned_coder/0_GPTSingleHead`, which is exactly the fine-tuned version provided in Ready-to-go Interaction.
77 | 
78 | \* For more training parameter options, run `python train.py -h`
79 | 
80 | ### Generative examples
81 | Good Python generation examples by the fine-tuned GPT2-medium
82 | ```python
83 | -------------Example 1--------------------------------------
84 | Context code: def factorial
85 | Generated:
86 | def factorial(n):
87 |     return 1 if n == 1 else n * factorial(n - 1)
88 | 
89 | --------------Example 2-------------------------------------
90 | Context code: def sum_of_series(num_of_terms:int, power: int)
91 | Generated:
92 | def sum_of_series(num_of_terms:int, power: int) -> float:
93 |     """Calculate Sum of Series.
94 |     >>> sum_of_series(1, 1, 10)
95 |     55.0
96 |     >>> sum_of_series(1, 10, 100)
97 |     49600.0
98 |     """
99 |     sum = (num_of_terms / 2) * power
100 |     return sum
101 | 
102 | --------------Example 3-------------------------------------
103 | Context code: def quick_sort(collection)
104 | Generated:
105 | def quick_sort(collection):
106 |     """
107 |     >>> quick_sort([0, 5, 3, 2, 2])
108 |     [0, 2, 2, 3, 5]
109 |     >>> quick_sort([])
110 |     []
111 |     >>> quick_sort([-2, -5, -45])
112 |     [-45, -5, -2]
113 |     """
114 |     length = len(collection)
115 |     if length <= 1:
116 |         return collection
117 |     else:
118 |         # Use the last element as the first pivot
119 |         pivot = collection.pop()
120 |         # Put elements greater than pivot in greater list
121 |         # Put elements lesser than pivot in lesser list
122 |         greater, lesser = [], []
123 |         for element in collection:
124 |             if element > pivot:
125 |                 ...
126 | ```
127 | Good Java generation examples by the fine-tuned GPT2-medium
128 | 
129 | ```java
130 | --------------Example 1-------------------------------------
131 | Context code: Scanner input= new Scanner(System.in)
132 | Generated:
133 | Scanner input= new Scanner(System.in)
134 | System.out.println("Enter the Octal number: ");
135 | // Take octal number as input from user in a string
136 | String oct = input.next();
137 | 
138 | --------------Example 2-------------------------------------
139 | Context code: private static int CountCharacters(String str)
140 | Generated:
141 | private static int CountCharacters(String str) {
142 |     return str.replaceAll("\\s", "").length();
143 | }
144 | ```
145 | \* Although some of the generated examples look good, they should be taken with a grain of salt when judging the model's actual performance. The model may simply **"remember"** existing code from the training set.
146 | 
147 | ### TODO list
148 | - Expand the dataset (and construct it more carefully) and increase the context window. Try larger generative models such as GPT-2 large or even [GPT-3 variants](https://arxiv.org/abs/2005.14165), as proposed recently, if computational resources allow.
149 | - Remove overlap between training examples and dev examples for contamination studies, i.e., to study to what extent the model memorizes examples rigidly or [relies on surface-level heuristics learned during training](https://arxiv.org/pdf/1902.01007.pdf).
150 | - Try some adversarial examples (more complicated ones, for probing the model's reasoning capability) to test the robustness of the model.
151 | - Integrate this into a real-life use case such as a code editor, e.g. [Sublime Text](https://www.sublimetext.com/), where a threshold on the joint probability of a suggestion may need to be studied for code snippet recommendations (see the sketch at the end of this README).
152 | - Try some ideas for location-aware code generation. For example, if a human coder is writing a comment, the autocoder should be aware of the coder's context (left and right, if available) to help complete the corresponding content.
153 | - Model size and inference efficiency are a problem in real-life use cases.
154 | - Do research in this problem domain to get a general idea of what work has been done in the literature on this particular problem.
155 | 
156 | 
157 | 
158 | ### Extra notes
159 | * Multi-GPU training only works with torch==1.4.0; it does not work with torch==1.5.0. No idea so far how to fix this issue.
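
### Sketch: joint-probability threshold for recommendations

The TODO item about editor integration mentions studying a threshold on the joint probability of a generated snippet. The code below is only a minimal sketch of how such a score could be computed with the fine-tuned models above; it is not part of this codebase, the `THRESHOLD` value is a made-up placeholder, and averaging per-token log-probabilities (rather than using the raw joint probability) is just one reasonable normalization choice.

```python
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer

THRESHOLD = -2.5  # hypothetical cut-off on the average per-token log-probability; would need tuning


def completion_score(model, tokenizer, context: str, completion: str, device: str = "cpu") -> float:
    """Average log-probability per token of `completion` given `context` under the language model."""
    context_ids = tokenizer.encode(context)
    full_ids = tokenizer.encode(context + completion)
    input_ids = torch.tensor([full_ids], device=device)
    with torch.no_grad():
        logits = model(input_ids)[0]  # (1, seq_len, vocab_size)
    # log p(token_i | tokens_<i) for every position after the first token
    log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
    targets = input_ids[:, 1:].unsqueeze(-1)
    token_log_probs = log_probs.gather(-1, targets).squeeze(-1)  # shape (1, seq_len - 1)
    # keep only the tokens that belong to the completion, not the context
    return token_log_probs[0, len(context_ids) - 1:].mean().item()


if __name__ == "__main__":
    tokenizer = GPT2Tokenizer.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
    model = GPT2LMHeadModel.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder").eval()
    completion = "(n):\n    return 1 if n == 1 else n * factorial(n - 1)"
    score = completion_score(model, tokenizer, "<python> def factorial", completion)
    print("recommend" if score >= THRESHOLD else "discard", round(score, 3))
```

An editor plugin would then surface a candidate completion only when its score clears the threshold.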
160 | 
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import os, pickle, json
3 | import logging
4 | 
5 | logger = logging.getLogger(__name__)
6 | from tqdm import tqdm
7 | 
8 | class SrcCodeDataset(Dataset):
9 |     def __init__(self, file_path, model, cache_path=None):
10 |         """
11 |         this dataset class is used to load the source code dataset in batches for fine-tuning with GPT2LMHeadModel
12 |         :param model: the model that the dataset will be fed to
13 |         """
14 |         self.inputs = []
15 |         load_cache = False
16 |         if cache_path != None:
17 |             load_cache = self._load_cache(cache_path)
18 |         if not load_cache:
19 |             self._build(file_path, model)
20 |             if cache_path != None:
21 |                 self._cache(cache_path)
22 | 
23 |     def __len__(self):
24 |         return len(self.inputs)
25 | 
26 |     def __getitem__(self, index):
27 |         input_ids = self.inputs[index]["input_ids"]
28 |         # input_mask = self.inputs[index]["attention_mask"]  we don't need attention_mask for this task
29 |         # return {"input_ids": input_ids, "input_mask": input_mask}
30 |         return {"input_ids": input_ids}
31 | 
32 |     def _load_cache(self, cache_path):
33 |         load_cache = False
34 |         if os.path.isdir(cache_path):
35 |             if os.path.isfile(os.path.join(cache_path, "inputs.pk")):
36 |                 with open(os.path.join(cache_path, "inputs.pk"), "rb") as f:
37 |                     logger.info(
38 |                         f" load cached token ids from {cache_path}")
39 |                     self.inputs = pickle.load(f)
40 |                     load_cache = True
41 |         return load_cache
42 | 
43 |     def _cache(self, cache_path):
44 |         if not os.path.isdir(cache_path):
45 |             os.makedirs(cache_path)
46 |         with open(os.path.join(cache_path, "inputs.pk"), "wb") as f:
47 |             pickle.dump(self.inputs, f)
48 |             logger.info(
49 |                 f" save tokenized ids of samples to: {cache_path}/inputs.pk")
50 | 
51 |     def _build(self, file_path, model):
52 |         with open(file_path) as f:
53 |             for line in tqdm(f):
54 |                 example = json.loads(line.strip())
55 |                 if example["label"].lower() == "python":
56 |                     encoded_plus = model.tokenizer.encode_plus(
57 |                         model.tokenize("<python>") + example["token_ids"] + [model.eos_token_id],
58 |                         max_length=model.max_seq_length)
59 |                 elif example["label"].lower() == "java":
60 |                     encoded_plus = model.tokenizer.encode_plus(
61 |                         model.tokenize("<java>") + example["token_ids"] + [model.eos_token_id],
62 |                         max_length=model.max_seq_length)
63 |                 self.inputs.append(encoded_plus.data)
64 | 
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | ### Download directly from [here](https://ucdcs-student.ucd.ie/~cwang/autocoder/source_code.zip)
2 | 
3 | Unzip the source_code archive and move it under this directory.
4 | 
5 | 
6 | ### Or build dataset from scratch
7 | This allows you to customize how the dataset is built. Below is an example of the building process.
8 | 
9 | Let's use Python and Java code from [The Algorithms project](https://github.com/TheAlgorithms) as the dataset. We want AutoCoder to help auto-complete code at a general level, and the code of The Algorithms suits that need. Another reason is that the code in this project is well written (high-quality code!).
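
For reference, each line of the `train.jsonl` / `dev.jsonl` files produced by the steps below is a JSON record containing a segment of GPT-2 token ids plus its language label (this is what `convert.py` writes and what `data.py` consumes). The token ids in this record are made up purely for illustration:

```
{"token_ids": [4299, 2163, 7, 77, 2599, 198, 220], "label": "Python"}
```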
10 | 11 | ##### download source code 12 | ``` 13 | git clone https://github.com/TheAlgorithms/Python 14 | git clone https://github.com/TheAlgorithms/Java 15 | ``` 16 | 17 | ##### Move the dowloaded two folders into here this `dataset/` directory and then run 18 | 19 | ``` 20 | python convert.py --segment_len 256 --stride 10 --dev_size 0.1 21 | ``` 22 | 23 | You will find a train set named train.jsonl and dev set named dev.jsonl under `source_code/json/`. 24 | 25 | Have a look at the `convert.py` script for the specific process of dataset construction or quickly read [this blog](#). 26 | 27 | -------------------------------------------------------------------------------- /dataset/convert.py: -------------------------------------------------------------------------------- 1 | import glob, json, os, argparse 2 | from tqdm import tqdm 3 | from sklearn.model_selection import train_test_split 4 | from transformers import GPT2Tokenizer 5 | 6 | if __name__ == '__main__': 7 | 8 | parser = argparse.ArgumentParser(description='Params') 9 | parser.add_argument('--segment_len', type=int, default=254, 10 | help='the length of each example') 11 | # we set this to be 254 instead of 256 because we want the input to be like: input_ids 12 | parser.add_argument('--stride', type=int, default=10, 13 | help='stride to split training examples') 14 | parser.add_argument('--dev_size', type=float, default=0.1, 15 | help='split ratio of development set for each language') 16 | args = parser.parse_args() 17 | 18 | gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False) 19 | paths = ['Python', 'Java'] 20 | segments = {} 21 | 22 | for path in paths: 23 | source_files = glob.glob(f'{path}/**/*.py' if path == "Python" else f'{path}/**/*.java', recursive=True) 24 | for each_src in tqdm(source_files): 25 | with open(each_src, "r", encoding="utf-8") as f: 26 | code_content = f.read() 27 | encoded = gpt2_tok.encode(code_content) 28 | for i in range(len(encoded) // args.stride): 29 | seg = encoded[i * args.stride:i * args.stride + args.segment_len] 30 | if path not in segments: 31 | segments[path] = [] 32 | segments[path].append(json.dumps({"token_ids": seg, "label": path})) 33 | 34 | train, dev = [], [] 35 | for key in segments: 36 | # we don't shuffle before splitting because we want the train and dev to be very different (less overlapping) 37 | tr, de = train_test_split(segments[key], test_size=args.dev_size) 38 | train += tr 39 | dev += de 40 | 41 | to_path = "source_code/json" 42 | if not os.path.isdir(to_path): 43 | os.makedirs(to_path) 44 | 45 | with open(os.path.join(to_path, "train.jsonl"), "w") as f: 46 | f.write("\n".join(train)) 47 | 48 | with open(os.path.join(to_path, "dev.jsonl"), "w") as f: 49 | f.write("\n".join(dev)) 50 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/demo.gif -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import torch 3 | import logging 4 | from torch.utils.data import DataLoader 5 | from tqdm import tqdm 6 | 7 | logging.basicConfig( 8 | format=logging.BASIC_FORMAT, 9 | datefmt='%Y-%m-%d %H:%M:%S', 10 | level=logging.INFO 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | class 
SingleCLMEvaluator(): 15 | def __init__(self, dataloader: DataLoader = None, 16 | data_tag: str = "dev", 17 | device: int = None, tokenizer=None, early_stop_on: str = "perplexity"): 18 | 19 | if data_tag not in ["dev", "train", "test"]: 20 | raise ValueError("data_tag has to be one of dev, train or test") 21 | assert early_stop_on in ["loss", "perplexity"] 22 | self.early_stop_on = early_stop_on 23 | self.dataloader = dataloader 24 | self.data_tag = data_tag 25 | self.tokenizer = tokenizer 26 | 27 | self.n_gpu = torch.cuda.device_count() 28 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | if device == -1: 30 | self.n_gpu = 0 31 | self.device = torch.device("cpu") 32 | 33 | def reset_dataloader(self, dataloader: DataLoader): 34 | self.dataloader = dataloader 35 | 36 | def reset_logger(self, output_path): 37 | pass 38 | 39 | def __call__(self, model, collate_fn, output_path: str = None, epoch: int = -1, steps: int = -1, 40 | target_names: List[str] = None, do_predict: bool = False) -> Dict[ 41 | str, float]: 42 | 43 | if do_predict and self.tokenizer == None: 44 | raise ValueError("you are doing predict so need a tokenizer") 45 | if self.dataloader is None: 46 | raise ValueError(" need to set dataloader for this evaluator, call reset_dataloader()") 47 | 48 | model.eval() 49 | if epoch == -1 and steps == -1: 50 | logger.info( 51 | f"\nEvaluation the model on {self.data_tag} dataset") 52 | else: 53 | logger.info( 54 | "\nEvaluation the model on " + self.data_tag + " dataset" + f" in epoch {epoch} after {steps} steps:") 55 | 56 | self.dataloader.collate_fn = collate_fn 57 | total_loss = 0.0 58 | total_steps = 0 59 | 60 | for step, batch in enumerate(tqdm(self.dataloader, desc="evaluating")): 61 | input = batch["features"] 62 | # batch to device 63 | for feature_name, ids in input.items(): 64 | input[feature_name] = ids.to(self.device) 65 | 66 | with torch.no_grad(): 67 | loss, logits = model(input) 68 | loss = loss.mean() 69 | total_loss += loss 70 | 71 | total_steps += 1 72 | eval_loss = total_loss / total_steps 73 | eval_results = {"loss": eval_loss} 74 | 75 | perplexity = torch.exp(torch.tensor(eval_loss)).clone().detach() 76 | eval_results["perplexity"] = perplexity.mean().item() 77 | return eval_results 78 | -------------------------------------------------------------------------------- /hf_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/hf_model.png -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/icon.png -------------------------------------------------------------------------------- /interact.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | import argparse 3 | 4 | if __name__ == '__main__': 5 | 6 | parser = argparse.ArgumentParser(description='Params') 7 | parser.add_argument('--model_path', type=str, default="model/gpt2_medium_fine_tuned_coder", 8 | help='the path to load fine-tuned model') 9 | parser.add_argument('--max_length', type=int, default=128, 10 | help='maximum length for code generation') 11 | parser.add_argument('--temperature', type=float, default=0.7, 12 | 
help='temperature for sampling-based code geneeration') 13 | parser.add_argument( 14 | "--use_cuda", action="store_true", help="inference with gpu?" 15 | ) 16 | 17 | args = parser.parse_args() 18 | 19 | # load fine-tunned model and tokenizer from path 20 | model = GPT2LMHeadModel.from_pretrained(args.model_path) 21 | tokenizer = GPT2Tokenizer.from_pretrained(args.model_path) 22 | 23 | model.eval() 24 | if args.use_cuda: 25 | model.to("cuda") 26 | 27 | # now the fine-tunned model supports two programming languages, namely, python and java 28 | def lang_select(): 29 | lang = "" 30 | while lang not in ["python", "java"]: 31 | print('Enter the programming language you prefer (python or java)') 32 | lang = input(">>> ").lower() 33 | return lang 34 | 35 | 36 | lang = lang_select() 37 | 38 | context = "" 39 | while context != "exit": 40 | print(f'You are using {lang} now. Enter the context code (exit or change_lang)') 41 | context = input(">>> ") 42 | 43 | if context == "change_lang": 44 | lang = lang_select() 45 | 46 | print(f"You are using {lang} now. Enter the context code") 47 | context = input(">>> ") 48 | 49 | input_ids = tokenizer.encode(" " + context, 50 | return_tensors='pt') if lang == "python" else tokenizer.encode( 51 | " " + context, return_tensors='pt') 52 | outputs = model.generate(input_ids=input_ids.to("cuda") if args.use_cuda else input_ids, 53 | max_length=args.max_length, 54 | temperature=args.temperature, 55 | num_return_sequences=1) 56 | for i in range(1): 57 | decoded = tokenizer.decode(outputs[i], skip_special_tokens=True) 58 | # ends with occurence of double new lines (to meet the convention of code completion) 59 | if "\n\n" in decoded: 60 | decoded = decoded[:decoded.index("\n\n")] 61 | 62 | print('Generated {}: {}'.format(i, decoded)) 63 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | import json 3 | from typing import Dict 4 | import os 5 | import torch 6 | from torch import nn, Tensor 7 | 8 | import logging 9 | logging.basicConfig( 10 | format=logging.BASIC_FORMAT, 11 | datefmt='%Y-%m-%d %H:%M:%S', 12 | level=logging.INFO 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | class GPTSingleHead(nn.Module): 17 | """ 18 | Different from directly using GPT2LMHeadModel, this wraps up GPT2LMHeadModel as well as GPT2Tokenizer 19 | """ 20 | def __init__(self, model_name_or_path: str, max_seq_length: int = 256, do_lower_case: bool = False, 21 | special_words_to_add=None): 22 | super(GPTSingleHead, self).__init__() 23 | self.config_keys = ['max_seq_length', 'do_lower_case'] 24 | self.do_lower_case = do_lower_case 25 | if max_seq_length > 1024: 26 | logging.warning( 27 | "GPT only allows a max_seq_length of 1024. 
Value will be set to 1024") 28 | max_seq_length = 1024 29 | self.max_seq_length = max_seq_length 30 | self.gpt = GPT2LMHeadModel.from_pretrained(model_name_or_path) 31 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 32 | if special_words_to_add != None: 33 | self.add_special_words(special_words_to_add) 34 | 35 | self.bos_token_id=self.tokenizer.bos_token_id 36 | self.eos_token_id=self.tokenizer.eos_token_id 37 | # self.pad_token_id=self.tokenizer.pad_token_id 38 | 39 | def tokenize(self, text: str): # default for cls 40 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 41 | 42 | def add_special_words(self, special_words_to_add): 43 | orig_num_tokens = len(self.tokenizer) 44 | num_added_tokens = self.tokenizer.add_special_tokens(special_words_to_add) 45 | if num_added_tokens > 0: 46 | self.gpt.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) 47 | 48 | def forward(self, input: Dict[str, torch.Tensor]): 49 | loss, logits=self.gpt(input["input_ids"],labels=input["input_ids"])[:2] 50 | return loss, logits 51 | 52 | def get_config_dict(self): 53 | return {key: self.__dict__[key] for key in self.config_keys} 54 | 55 | def padding_features(self, features_dict_list): 56 | """ 57 | padding features for a batch 58 | :param features_dict_list: i.e., batch 59 | :return: padded batch features 60 | """ 61 | max_input_len_this_batch = 0 62 | 63 | batch_features = {feature_name: [] for feature_name in features_dict_list[0]} 64 | for feature_dict in features_dict_list: 65 | for feature_name, feature_ids in feature_dict.items(): 66 | if feature_name == "input_ids" and len(feature_ids) > max_input_len_this_batch: 67 | max_input_len_this_batch = len(feature_ids) 68 | batch_features[feature_name].append(feature_ids) 69 | 70 | padded_batch_features = {feature_name: [] for feature_name in features_dict_list[0]} 71 | for feature_name, batch_ids in batch_features.items(): 72 | 73 | for each_ids in batch_ids: 74 | padded = each_ids + [self.tokenizer.pad_token_id] * (max_input_len_this_batch - len(each_ids)) 75 | padded_batch_features[feature_name].append(padded) 76 | 77 | for feature_name, ids in padded_batch_features.items(): 78 | padded_batch_features[feature_name] = torch.tensor(ids) 79 | 80 | return padded_batch_features 81 | 82 | def get_embedding_dimension(self) -> int: 83 | return self.gpt.config.hidden_size 84 | 85 | def get_config(self) -> int: 86 | return self.gpt.config 87 | 88 | def save(self, output_path: str): 89 | self.gpt.save_pretrained(output_path) 90 | self.tokenizer.save_pretrained(output_path) 91 | with open(os.path.join(output_path, 'gpt_sh_config.json'), 'w') as f: 92 | json.dump(self.get_config_dict(), f, indent=2) 93 | 94 | def reload(self, input_path: str): 95 | """reload from checkpoint weights""" 96 | return GPTSingleHead.load(input_path + "/0_GPTSingleHead") 97 | 98 | @staticmethod 99 | def load(input_path: str): 100 | if not os.path.isfile(os.path.join(input_path, 'gpt_sh_config.json')): 101 | raise ValueError("In the model path does not find gpt_sh_config.json file, you may have not trained yet") 102 | with open(os.path.join(input_path, 'gpt_sh_config.json')) as f: 103 | config = json.load(f) 104 | return GPTSingleHead(model_name_or_path=input_path, **config) 105 | 106 | 107 | class EmptyHeads(nn.Module): 108 | def __init__(self): 109 | self.config_keys=[] 110 | super().__init__() 111 | 112 | def forward(self, input: Dict[str, Tensor]): 113 | return input 114 | 115 | def 
get_config_dict(self): 116 | return {key: self.__dict__[key] for key in self.config_keys} 117 | 118 | def save(self, output_path): 119 | with open(os.path.join(output_path, 'empty_heads_config.json'), 'w') as f: 120 | json.dump(self.get_config_dict(), f, indent=2) 121 | torch.save(self.state_dict(), os.path.join(output_path, 'empty_heads.pt')) 122 | 123 | def load_saved(self, input_path): 124 | self.load_state_dict(torch.load(os.path.join(input_path, '1_EmptyHeads', 'empty_heads.pt'))) 125 | 126 | @staticmethod 127 | def load(input_path,config): 128 | if not os.path.isfile(os.path.join(input_path, 'empty_heads_config.json')): 129 | raise ValueError( 130 | "In the model path does not find empty_heads_config.json file, you may have not trained yet") 131 | 132 | with open(os.path.join(input_path, 'empty_heads_config.json')) as f: 133 | config = json.load(f) 134 | model = EmptyHeads() 135 | 136 | if not os.path.isfile(os.path.join(input_path, 'empty_heads.pt')): 137 | raise ValueError("In the model path does not find state of file, you need to train and get weights first") 138 | 139 | model.load_state_dict(torch.load(os.path.join(input_path, 'empty_heads.pt'))) 140 | return model 141 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | transformers 4 | torch==1.4.0 5 | numpy 6 | wandb -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import logging 3 | 4 | logging.basicConfig( 5 | format=logging.BASIC_FORMAT, 6 | datefmt='%Y-%m-%d %H:%M:%S', 7 | level=logging.INFO 8 | ) 9 | logger = logging.getLogger(__name__) 10 | 11 | MODEL_MAP = {"distilgpt2": "distilgpt2", "gpt2": "gpt2", "gpt2_medium": "gpt2-medium", 12 | "gpt2_large": "gpt2-large"} 13 | 14 | from model import GPTSingleHead 15 | from trainer import ModelTrainer 16 | from data import SrcCodeDataset 17 | from evaluate import SingleCLMEvaluator 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser(description='Hyper params') 21 | parser.add_argument('--model_select', type=str, default="distilgpt2", 22 | help='model select from distilgpt2, gpt2_medium, gpt2, or gpt2_large') 23 | parser.add_argument('--dataset_name', type=str, default="source_code", 24 | help='dataset name whatever name you put into the ./dataset directory (by default: source_code)') 25 | parser.add_argument('--per_gpu_train_batch_size', type=int, default=4, 26 | help='input batch size for training') 27 | parser.add_argument('--dev_batch_size', type=int, default=8, 28 | help='input batch size for development') 29 | parser.add_argument('--num_epochs_train', type=int, default=16, 30 | help='number of epochs to train') 31 | parser.add_argument('--max_seq_length', type=int, default=256, 32 | help='maximum sequence length of samples in a batch for training') 33 | parser.add_argument('--lr', type=float, default=2e-5, 34 | help='learning rate') 35 | parser.add_argument('--warmup_ratio', type=float, default=0.2, 36 | help='warmup_ratio') 37 | parser.add_argument('--early_stop', type=int, default=20, 38 | help='early_stop') 39 | parser.add_argument('--scheduler', type=str, default="warmuplinear", 40 | help='scheduler') 41 | parser.add_argument('--seed', type=int, default=122, 42 | help='random seed') 43 | parser.add_argument('--accumulation_steps', type=int, 
default=1, 44 | help='accumulation steps if you want large batch size but can not fit in the memory allowed') 45 | parser.add_argument('--n_gpu', type=int, default=1, 46 | help='number of gpu for training') 47 | parser.add_argument('--visiable_device', type=str, default="0", 48 | help='visiable gpus for training, should be consistent with n_gpu') 49 | parser.add_argument('--evaluation_steps', type=int, default=200, 50 | help='evaluation_steps') 51 | parser.add_argument('--wandb_project_name', type=str, default="code_generate", 52 | help='project name for wandb') 53 | parser.add_argument( 54 | "--restore_training", action="store_true", help="restore training if a saved checkopint exists" 55 | ) 56 | parser.add_argument( 57 | "--with_wandb", action="store_true", help="Train with wandb tracking." 58 | ) 59 | 60 | args = parser.parse_args() 61 | logger.info(f"args: {args}") 62 | dataset_folder = f"dataset/{args.dataset_name}/json/" 63 | assert args.model_select in MODEL_MAP.keys(), (f"model has to be in {MODEL_MAP.keys()}") 64 | output_path = f"model/{args.model_select}_fine_tuned_coder" 65 | logger.info("{} for dataset in: {}".format(output_path, dataset_folder)) 66 | logger.info( 67 | f"*****************model select: {args.model_select} for code generation using dataset: {args.dataset_name}******************") 68 | # add more params for wandb 69 | args.wandb_run_name = output_path 70 | #initialize model by model name (the same as used in transformers lib) 71 | model = GPTSingleHead(MODEL_MAP[args.model_select], max_seq_length=args.max_seq_length) 72 | #add special tokens for controlling code generation by different programming language 73 | model.add_special_words({"pad_token": "", "additional_special_tokens": ["", ""]}) 74 | #load training dataset 75 | file_path = dataset_folder + "train.jsonl" 76 | train_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "train")) 77 | #load developlemt dataset 78 | file_path = dataset_folder + "dev.jsonl" 79 | dev_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "dev")) 80 | # initialize development evaluator 81 | dev_evaluator = SingleCLMEvaluator() 82 | # initialize model trainer 83 | model_trainer = ModelTrainer(model, 84 | train_dataset=train_dataset, 85 | dev_dataset=dev_dataset, 86 | dev_evaluator=dev_evaluator, 87 | scheduler=args.scheduler, 88 | epochs=args.num_epochs_train, 89 | per_gpu_train_batch_size=args.per_gpu_train_batch_size, 90 | output_path=output_path, 91 | optimizer_params={'lr': args.lr, 'eps': 1e-6, 'correct_bias': False}, 92 | evaluation_steps=args.evaluation_steps, 93 | early_stop=args.early_stop, 94 | dev_batch_size=args.dev_batch_size, 95 | restore_training=args.restore_training, 96 | accumulation_steps=args.accumulation_steps, 97 | n_gpu=args.n_gpu, 98 | visiable_device=args.visiable_device, 99 | warmup_ratio=args.warmup_ratio, 100 | seed=args.seed, 101 | data_loader_shuffle=True, 102 | wandb_config=args if args.with_wandb else None) 103 | #start training 104 | model_trainer.train() 105 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import shutil 5 | import sys 6 | from typing import Type, Dict 7 | 8 | import torch 9 | import transformers 10 | 11 | try: 12 | from torch.utils.tensorboard import SummaryWriter 13 | except ImportError: 14 | from tensorboardX import 
SummaryWriter 15 | from torch import nn 16 | from torch.utils.data import DataLoader 17 | from torch.utils import data 18 | from torch.optim.optimizer import Optimizer 19 | from tqdm import trange, tqdm 20 | 21 | from dateutil.relativedelta import relativedelta 22 | 23 | import random 24 | import numpy as np 25 | import logging 26 | from model import EmptyHeads 27 | 28 | logging.basicConfig( 29 | format=logging.BASIC_FORMAT, 30 | datefmt='%Y-%m-%d %H:%M:%S', 31 | level=logging.INFO 32 | ) 33 | logger = logging.getLogger(__name__) 34 | 35 | from datetime import datetime 36 | 37 | try: 38 | import wandb 39 | 40 | wandb.ensure_configured() 41 | if wandb.api.api_key is None: 42 | _has_wandb = False 43 | wandb.termwarn("W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.") 44 | else: 45 | _has_wandb = False if os.getenv("WANDB_DISABLED") else True 46 | except ImportError: 47 | _has_wandb = False 48 | 49 | 50 | def set_seed(seed, n_gpu): 51 | logger.info(f" see seed for random, numpy and torch {seed}") 52 | random.seed(seed) 53 | np.random.seed(seed) 54 | torch.manual_seed(seed) 55 | if n_gpu > 0: 56 | torch.cuda.manual_seed_all(seed) 57 | 58 | 59 | def print_model_state_dict(model): 60 | for param_tensor in model.state_dict(): 61 | logger.info(f"{param_tensor}\t{model.state_dict()[param_tensor].size()}") 62 | 63 | 64 | def print_optimizer_state_dict(optimizer): 65 | for var_name in optimizer.state_dict(): 66 | logger.info(f"{var_name}\t{optimizer.state_dict()[var_name]}") 67 | 68 | 69 | def count_params(model: torch.nn.Module, print_details: bool = False): 70 | trainable_count = 0 71 | total_count = 0 72 | if isinstance(model, torch.nn.Sequential): 73 | for index in model._modules: 74 | if print_details: 75 | print_model_state_dict(model._modules[index]) 76 | logger.info(model._modules[index]) 77 | trainable_count += sum(p.numel() for p in model._modules[index].parameters() if p.requires_grad) 78 | total_count += sum(p.numel() for p in model._modules[index].parameters()) 79 | else: 80 | if print_details: 81 | print_model_state_dict(model) 82 | logger.info(model) 83 | total_count = sum(p.numel() for p in model.parameters()) 84 | trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad) 85 | logger.info(f' Total params: {total_count}') 86 | logger.info(f' Trainable params: {trainable_count}') 87 | logger.info(f' Non-trainable params: {total_count - trainable_count}') 88 | 89 | 90 | def batch_to_device(batch, device, keep_label=False): 91 | features = batch['features'] 92 | if isinstance(features, dict): 93 | for feature_name in features: 94 | features[feature_name] = features[feature_name].to(device) 95 | else: 96 | for inx in range(len(features)): 97 | for feature_name in features[inx]: 98 | features[inx][feature_name] = features[inx][feature_name].to(device) 99 | 100 | label_space = batch['labels'] 101 | if label_space == None: # for tasks like lm, labels are none. 
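        # Note: for the causal LM task the collate function intentionally sets batch['labels'] to None,
        # since GPTSingleHead.forward() reuses input_ids as the labels when computing the loss.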
102 | return features, None 103 | if not keep_label: 104 | labels = {"label_space_" + str(inx): label_space[inx].to(device) if torch.is_tensor(label_space[inx]) else 105 | label_space[inx] for inx in range(len(label_space))} 106 | else: 107 | labels = label_space 108 | return features, labels 109 | 110 | 111 | def is_wandb_available(): 112 | return _has_wandb 113 | 114 | 115 | class CollateFunction(): 116 | def __init__(self, up_model): 117 | self.up_model = up_model 118 | 119 | def __call__(self, batch): 120 | if isinstance(batch[0], dict): 121 | padded_features = self.up_model.padding_features(batch) 122 | return {'features': padded_features, 123 | "labels": None} # label_ids are in features, this task does not need labels, we set 124 | 125 | 126 | class ModelTrainer(): 127 | def __init__(self, up_model: nn.Module, down_layer: nn.Module = None, train_dataset=None, 128 | dev_dataset=None, dev_evaluator=None, 129 | epochs: int = 1, 130 | visiable_device: str = "0", 131 | scheduler: str = 'warmuplinear', 132 | warmup_ratio: float = 0.1, 133 | optimizer_class: Type[Optimizer] = transformers.AdamW, 134 | optimizer_params: Dict[str, object] = {'lr': 5e-5, 'eps': 1e-6, 'correct_bias': False}, 135 | weight_decay: float = 0.01, 136 | early_stop: int = 20, 137 | # 20 evaluation steps without improving on the early_stop_on metric as specified in dev_evaluator 138 | evaluation_steps: int = 500, 139 | output_path: str = None, 140 | save_best_model: bool = True, 141 | max_grad_norm: float = 1, 142 | fp16: bool = False, 143 | accumulation_steps=1, 144 | fp16_opt_level: str = 'O1', 145 | seed: int = 122, 146 | data_loader_shuffle=True, 147 | device: str = None, 148 | dev_batch_size: int = -1, # the same as train_batch_size 149 | n_gpu: int = None, 150 | report_model: bool = True, 151 | per_gpu_train_batch_size: int = 8, 152 | restore_training: bool = False, 153 | local_rank: int = -1, 154 | wandb_config=None): 155 | """ 156 | this trainer is written for training a sequential model that contains an upstream_layer (usually transformers) 157 | and a downstream_layer (usually task-specific heads like FF, RNN, CNN for encoding the output of upstram_layer) 158 | 159 | :param up_model: transformers like transformers.GPT2LMHeadModel or transformers.BERTModel 160 | :param down_layer: None if up_model already wraps up with an output encoder such as LMHead in GPT2LMHeadModel, else nn.Module for encoding the output of up_model 161 | :param train_dataset: train_dataset, it can be either instance of torch.data.Dataset or IterableDataset (defined in data.py) 162 | :param dev_dataset: dev_dataset, it can be either instance of torch.data.Dataset or IterableDataset 163 | :param dev_evaluator: dev_evaluator, evaluator on dev_dataset for early stop and performance tracking during training (defined in evaluate.py) 164 | :param epochs: number of epoches for training 165 | :param visiable_device: devices chosen to perform training 166 | :param scheduler: scheduler specially from transformers: see options in self._get_scheduler 167 | :param warmup_ratio: warmup_ratio ratio for learning rate over total training steps 168 | :param optimizer_class: transformers.AdamW de byfault 169 | :param optimizer_params: optimizer params 170 | :param weight_decay:weight decay 171 | :param early_stop:early stop steps 172 | :param evaluation_steps:logging steps 173 | :param output_path: path to save the checkpoint with the best performance as specified in early_stop_on in dev_evaluator instance 174 | :param save_best_model:save best checkpoint 
or the latest checkpoint 175 | :param max_grad_norm:max grad norm 176 | :param fp16: fp16 training 177 | :param accumulation_steps:accumulation steps 178 | :param fp16_opt_level:fp16 opt level 179 | :param seed:random seed for reproducibility 180 | :param data_loader_shuffle:Whether to shuffle data_loader of training dataset and dev dataset after epoch ends 181 | :param device: device for training, None or gpu for gpu training, cpu for gpu training 182 | :param dev_batch_size: development batch size, usually larger than training batch size due to no grads calculation and hence less burden on memory 183 | :param n_gpu: number of gpus for training 184 | :param report_model:if report model's structure and number of trainable params in logging 185 | :param per_gpu_train_batch_size: what it means literally 186 | :param restore_training: if restore training if the training process is interupped due to some accidents 187 | :param local_rank:for distributed training 188 | :param wandb_config: wandb logging if not none, else without wandb logging 189 | """ 190 | 191 | self.up_model = up_model 192 | if down_layer == None: 193 | # In this example, the upstream_layer already integrate the downstream head (namely, simple LM head as in transformers.GPT2LMHeadModel) 194 | # EmptyHeads is created here only for placeholder purpose 195 | down_layer = EmptyHeads() 196 | 197 | self.down_layer = down_layer 198 | assert output_path != None 199 | output_path = os.path.join("tmp", output_path) 200 | # os.makedirs(output_path,exist_ok=True) 201 | if restore_training: 202 | if not os.listdir(output_path): 203 | raise ValueError(f"no checkpoint found in {output_path}") 204 | else: 205 | logger.info(" loading embedding weights from saved checkpoint") 206 | self.up_model = self.up_model.reload( 207 | output_path) # for other transformers (apart from bert), the load_saved function has not been added 208 | 209 | logger.info(" loading downstream weights from saved checkpoint") 210 | self.down_layer.load_saved(output_path) 211 | with open(output_path + "/ck_report.json") as f: 212 | self.ck_report = json.load(f) 213 | 214 | self.model = torch.nn.Sequential(self.up_model, self.down_layer) 215 | 216 | if is_wandb_available() and wandb_config != None: 217 | # keep track of model topology and gradients if is_wandb_available and args!=None 218 | wandb.init(project=wandb_config.wandb_project_name, config=wandb_config, name=wandb_config.wandb_run_name) 219 | wandb.watch( 220 | (self.up_model, self.down_layer), log_freq=max(100, evaluation_steps) 221 | ) 222 | self.wandb_config = wandb_config 223 | 224 | self._restore_training = restore_training 225 | self.early_stop = early_stop 226 | 227 | self._dev_evaluator = dev_evaluator 228 | 229 | self._evaluation_steps = evaluation_steps 230 | self._save_best_model = save_best_model 231 | self._max_grad_norm = max_grad_norm 232 | 233 | os.makedirs(output_path, exist_ok=True) 234 | if os.listdir(output_path) and not restore_training: 235 | out = input( 236 | "Output directory ({}) already exists and is not empty, you wanna remove it before start? 
(y/n)".format( 237 | output_path)) 238 | if out == "y": 239 | shutil.rmtree(output_path) 240 | os.makedirs(output_path, exist_ok=True) 241 | else: 242 | raise ValueError("Output directory ({}) already exists and is not empty".format( 243 | output_path)) 244 | 245 | logFormatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 246 | fileHandler = logging.FileHandler(os.path.join(output_path, "log.out"), mode="a") 247 | fileHandler.setFormatter(logFormatter) 248 | logger.addHandler(fileHandler) 249 | self._dev_evaluator.reset_logger(output_path) 250 | 251 | self.output_path = output_path 252 | 253 | if device is None or device == "cuda": 254 | if torch.cuda.is_available(): 255 | device = torch.device("cuda") 256 | n_gpu = 1 if n_gpu == 1 else torch.cuda.device_count() 257 | else: 258 | logger.warning("no cuda is found in your machine, now use cpu") 259 | device = torch.device("cpu") 260 | n_gpu = 0 261 | elif device == "cpu": 262 | device = torch.device("cpu") 263 | n_gpu = 0 264 | else: 265 | raise ValueError("set device to be None, cuda or cpu") 266 | assert n_gpu <= torch.cuda.device_count() 267 | 268 | logger.info("Use pytorch device: {}, with gpu_number={}".format(device, n_gpu)) 269 | 270 | self._train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) 271 | self._dev_batch_size = dev_batch_size if dev_batch_size != -1 else self._train_batch_size 272 | 273 | if isinstance(train_dataset, data.IterableDataset): 274 | self._train_dataloader = DataLoader(train_dataset, batch_size=None) 275 | self._steps_per_epoch = len(self._train_dataloader.dataset) 276 | else: 277 | self._train_dataloader = DataLoader(train_dataset, shuffle=data_loader_shuffle, 278 | batch_size=self._train_batch_size) 279 | self._steps_per_epoch = len(self._train_dataloader) 280 | 281 | if isinstance(dev_dataset, data.IterableDataset): 282 | dev_dataloader = DataLoader(dev_dataset, batch_size=None) 283 | else: 284 | dev_dataloader = DataLoader(dev_dataset, shuffle=data_loader_shuffle, batch_size=self._dev_batch_size) 285 | 286 | if accumulation_steps > 1: 287 | self._steps_per_epoch = self._steps_per_epoch // accumulation_steps 288 | 289 | self._dev_data = dev_dataset 290 | self._dev_evaluator.reset_dataloader(dev_dataloader) 291 | 292 | self.collate_fn = CollateFunction(self.up_model) 293 | # Use customize batching 294 | self._train_dataloader.collate_fn = self.collate_fn 295 | 296 | self._train_data = train_dataset 297 | self._per_gpu_train_batch_size = per_gpu_train_batch_size 298 | 299 | set_seed(seed, n_gpu) 300 | 301 | if n_gpu > 1: 302 | self.model = torch.nn.DataParallel(self.model, device_ids=[int(i) for i in visiable_device.split(',')]) 303 | self.model = self.model.to(f'cuda:{self.model.device_ids[0]}') 304 | 305 | elif n_gpu == 1: 306 | self.model = self.model.to(device) 307 | 308 | self._device = device 309 | self._n_gpu = n_gpu 310 | 311 | self._total_train_steps = int(self._steps_per_epoch * epochs) 312 | self._epochs = epochs 313 | 314 | if report_model: 315 | count_params(self.model, print_details=True) 316 | 317 | param_optimizer = list(self.model.named_parameters()) 318 | 319 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 320 | optimizer_grouped_parameters = [ 321 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 322 | 'weight_decay': weight_decay}, 323 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 324 | ] 325 | if local_rank != -1: 326 | self._total_train_steps = 
self._total_train_steps // torch.distributed.get_world_size() 327 | 328 | self._optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) 329 | 330 | warmup_steps = math.ceil(self._total_train_steps * warmup_ratio) # by default 20% of train data for warm-up 331 | logger.info(f" Warmup-steps: {warmup_steps}") 332 | 333 | self._scheduler = self._get_scheduler(self._optimizer, scheduler=scheduler, warmup_steps=warmup_steps, 334 | num_total=self._total_train_steps) 335 | 336 | if fp16: 337 | try: 338 | from apex import amp 339 | except ImportError: 340 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 341 | 342 | model, optimizer = amp.initialize(self.model, self._optimizer, opt_level=fp16_opt_level) 343 | self.model = model 344 | self._optimizer = optimizer 345 | 346 | self._fp16 = fp16 347 | tb_writer = None 348 | if local_rank in [-1, 0]: 349 | tb_writer = SummaryWriter() 350 | self._tb_writer = tb_writer 351 | self._local_rank = local_rank 352 | self._best_score = -float("inf") 353 | self._early_stop_count = 0 354 | self.last_time = datetime.now() 355 | self.accumulation_steps = accumulation_steps 356 | # assert evaluation_steps % accumulation_steps == 0, "evaluation_steps should be divisable by accumulation_steps" 357 | 358 | def _train_epoch(self, epoch: int, global_steps: int): 359 | epoch_steps = 0 360 | epoch_loss = 0.0 361 | 362 | self.model.zero_grad() 363 | for step, data in enumerate( 364 | tqdm(self._train_dataloader, desc="training", total=self._steps_per_epoch * self.accumulation_steps)): 365 | 366 | self.model.train() 367 | if data["labels"] != "skip-device": 368 | input, labels = batch_to_device(data, self._device) 369 | # add labels to input for training where this step is ignored when inference 370 | if isinstance(labels, dict): 371 | for idx in range(len(input)): 372 | input[idx].update(labels) 373 | else: 374 | input = data["features"] 375 | loss_value, _ = self.model(input) 376 | 377 | if self._n_gpu > 1: 378 | loss_value = loss_value.mean() 379 | if self.accumulation_steps > 1: 380 | loss_value = loss_value / self.accumulation_steps 381 | 382 | if self._fp16: 383 | try: 384 | from apex import amp 385 | except ImportError: 386 | raise ImportError( 387 | "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 388 | with amp.scale_loss(loss_value, self._optimizer) as scaled_loss: 389 | scaled_loss.backward() 390 | torch.nn.utils.clip_grad_norm_(amp.master_params(self._optimizer), self._max_grad_norm) 391 | else: 392 | loss_value.backward() 393 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._max_grad_norm) 394 | epoch_loss += loss_value 395 | 396 | if (step + 1) % self.accumulation_steps == 0: 397 | 398 | self._optimizer.step() 399 | self._scheduler.step() 400 | self.model.zero_grad() 401 | 402 | epoch_steps += 1 403 | total_global = epoch_steps + global_steps 404 | 405 | if self._evaluation_steps > 0 and (total_global) % self._evaluation_steps == 0: 406 | dev_loss, eval_scores = self._dev_eval_in_training(epoch, epoch_steps) 407 | logger.info(" ***** Evaluation report *****") 408 | logger.info(f" Output path (short): {self.output_path}") 409 | logger.info(f" Early stop on: {self._dev_evaluator.early_stop_on}") 410 | logger.info(f" Early stop count = {self._early_stop_count}/{self.early_stop}") 411 | logger.info( 412 | f" Eval steps = {self._evaluation_steps} or (iterations = {self._evaluation_steps * self.accumulation_steps})") 413 | logger.info(f" Best 
414 |                     logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
415 |
416 |                     logger.info(
417 |                         f" Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
418 |                     logger.info(
419 |                         f" Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._dev_data)}")
420 |                     now_time = datetime.now()
421 |                     logger.info(f" Time spent since last evaluation = {self.time_diff(self.last_time, now_time)}")
422 |                     self.last_time = now_time
423 |
424 |                     logger.info(f" Epoch = {epoch + 1}/{self._epochs}")
425 |                     logger.info(f" Steps = {total_global}/{self._total_train_steps}")
426 |                     logger.info(
427 |                         f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
428 |                     if dev_loss is not None:
429 |                         logger.info(f" dev_loss = {dev_loss:.6f}\t||\t dev_eval_scores = {eval_scores}")
430 |                     else:
431 |                         logger.info(f" dev_eval_scores = {eval_scores}")
432 |
433 |                     train_loss = epoch_loss / epoch_steps
434 |                     logger.info(f" train_loss = {train_loss}")
435 |                     logger.info("\n********************************************")
436 |
437 |                     if is_wandb_available() and self.wandb_config is not None:
438 |                         if dev_loss is not None:
439 |                             wandb.log(
440 |                                 {"loss_dev": dev_loss,
441 |                                  f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
442 |                                  "loss_train": train_loss, "lr": self._scheduler.get_lr()[0]},
443 |                                 step=total_global)
444 |                         else:
445 |                             wandb.log({"loss_train": train_loss,
446 |                                        f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
447 |                                        "lr": self._scheduler.get_lr()[0]},
448 |                                       step=total_global)
449 |
450 |                     for key, value in eval_scores.items():
451 |                         if is_wandb_available() and self.wandb_config is not None:
452 |                             wandb.log({f"eval_{key}_dev": value}, step=total_global)
453 |                         self._tb_writer.add_scalar(f"eval_{key}_dev", value, total_global)
454 |
455 |                     self._tb_writer.add_scalar("lr", self._scheduler.get_lr()[0], total_global)
456 |                     if dev_loss is not None:
457 |                         self._tb_writer.add_scalar("loss_dev", dev_loss, total_global)
458 |
459 |                     self._tb_writer.add_scalar("loss_train", train_loss, total_global)
460 |
461 |                     if self._early_stop_count >= self.early_stop:
462 |                         logger.info(
463 |                             f" {self.early_stop} consecutive evaluation steps without improvement, so early stopping...")
464 |                         sys.exit(0)
465 |
466 |         return epoch_loss, epoch_steps
467 |
468 |     def train(self):
469 |         if self._restore_training:
470 |             logger.info(f"***** restoring training from the previous checkpoint: {self.ck_report} *****")
471 |         else:
472 |             logger.info("***** Running training *****")
473 |         logger.info(
474 |             f" Num of training examples (actually iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
475 |         logger.info(f" Output path (short): {self.output_path}")
476 |         logger.info(
477 |             f" Steps per Epoch = {self._steps_per_epoch} or iterations per epoch = {self._steps_per_epoch * self.accumulation_steps}")
478 |         logger.info(f" Num of Epochs = {self._epochs}")
479 |         logger.info(f" Best score ({self._dev_evaluator.early_stop_on}) = {self._best_score}")
480 |         logger.info(
481 |             f" Eval every {self._evaluation_steps} steps or every {self._evaluation_steps * self.accumulation_steps} iterations")
482 |         logger.info(f" Early stop = {self.early_stop}")
483 |         logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
484 |
485 |         logger.info(f" Total optimization steps = {self._total_train_steps}")
486 |         logger.info(
487 |             f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
488 |         global_loss = 0.0
489 |         global_steps = 0
490 |         self.last_time = datetime.now()
491 |         for epoch in trange(self._epochs, desc="Epoch"):
492 |             epoch_loss, epoch_steps = self._train_epoch(epoch, global_steps)
493 |             global_loss += epoch_loss
494 |             global_steps += epoch_steps
495 |             logger.info(f"epoch {epoch + 1} ends, {self._epochs - epoch - 1} epochs left")
496 |         logger.info(
497 |             f"\nglobal_average_loss={global_loss / global_steps}, global_steps={global_steps} on training set")
498 |
499 |         if self._local_rank in [-1, 0]:
500 |             self._tb_writer.close()
501 |
502 |     def _dev_eval_in_training(self, epoch, steps):
503 |         return_scores = {}
504 |         if self._dev_evaluator is not None:
505 |
506 |             return_scores = self._dev_evaluator(self.model, self.collate_fn,
507 |                                                 output_path=self.output_path, epoch=epoch, steps=steps)
508 |
509 |         early_stop_on = self._dev_evaluator.early_stop_on
510 |
511 |         check_score = -return_scores[early_stop_on] if early_stop_on in ("loss", "perplexity") else \
512 |             return_scores[early_stop_on]
513 |         if check_score >= self._best_score and self._save_best_model:
514 |             eval_scores_transformed = {key:
515 |                                        return_scores[key].item() if torch.is_tensor(return_scores[key]) else
516 |                                        return_scores[key]
517 |                                        for key in return_scores.keys()}
518 |             self.save(self.output_path,
519 |                       {"training_examples (when pos_num=1 for ranking)": len(self._train_data),
520 |                        "evaluation_steps": self._evaluation_steps,
521 |                        "train_batch_size": self._train_batch_size, "epoch": epoch + 1, "total_epochs": self._epochs,
522 |                        "steps": steps,
523 |                        "saved_at_total_steps": steps + epoch * self._steps_per_epoch,
524 |                        "steps_per_epoch": self._steps_per_epoch, "eval_scores_on_dev": eval_scores_transformed})
525 |
526 |             self._best_score = check_score
527 |
528 |             logger.info(f" Save checkpoint at epoch={epoch} step={steps}")
529 |             self._early_stop_count = 0
530 |         else:
531 |             self._early_stop_count += 1
532 |
533 |         return return_scores.pop("loss").item() if "loss" in return_scores else None, return_scores
534 |
535 |     def save(self, path, eval_details):
536 |         if path is None:
537 |             return
538 |         logger.info(f" Save model to {path}")
539 |         contained_modules = []
540 |
541 |         to_iterate = self.model.module._modules if self._n_gpu > 1 else self.model._modules
542 |
543 |         for idx, name in enumerate(to_iterate):
544 |             module = to_iterate[str(name)]
545 |
546 |             model_path = os.path.join(path, str(idx) + "_" + type(module).__name__)
547 |             os.makedirs(model_path, exist_ok=True)
548 |             module.save(model_path)
549 |             contained_modules.append(
550 |                 {'idx': idx, 'name': name, 'path': os.path.basename(model_path), 'type': type(module).__module__})
551 |
552 |         if self.wandb_config is not None:
553 |             with open(os.path.join(path, 'hyperparams.json'), 'w') as f:
554 |                 json.dump(self.wandb_config.__dict__, f, indent=2)
555 |
556 |         with open(os.path.join(path, 'modules.json'), 'w') as fOut:
557 |             json.dump(contained_modules, fOut, indent=2)
558 |         with open(os.path.join(path, 'ck_report.json'), 'w') as fOut:
559 |             json.dump(eval_details, fOut, indent=2)
560 |
561 |     def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, num_total: int):
562 |         assert scheduler in ["constantlr", "warmuplinear", "warmupconstant", "warmupcosine",
563 |                              "warmupcosinewithhardrestarts"], (
564 |             'scheduler should be one of ["constantlr", "warmuplinear", "warmupconstant", "warmupcosine", "warmupcosinewithhardrestarts"]')
565 |         if scheduler == 'constantlr':
566 |             return transformers.get_constant_schedule(optimizer)
567 |         elif scheduler == 'warmupconstant':
568 |             return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
569 |         elif scheduler == 'warmuplinear':
570 |             return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
571 |                                                                  num_training_steps=num_total)
572 |         elif scheduler == 'warmupcosine':
573 |             return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
574 |                                                                  num_training_steps=num_total)
575 |         elif scheduler == 'warmupcosinewithhardrestarts':
576 |             return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
577 |                                                                                     num_warmup_steps=warmup_steps,
578 |                                                                                     num_training_steps=num_total)
579 |
580 |     def time_diff(self, t_a, t_b):
581 |         t_diff = relativedelta(t_b, t_a)  # later/end time comes first!
582 |         return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)
583 |
--------------------------------------------------------------------------------
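
The `_train_epoch` loop in trainer.py combines gradient accumulation, gradient clipping, and a per-update scheduler step. Below is a minimal, self-contained sketch of that pattern under simplified assumptions; `model`, `dataloader`, `optimizer`, `scheduler`, and the cross-entropy loss are illustrative placeholders, not this repository's objects.

# Sketch of the gradient-accumulation pattern used in _train_epoch (illustrative only).
import torch

def train_one_epoch(model, dataloader, optimizer, scheduler,
                    accumulation_steps=4, max_grad_norm=1.0, device="cpu"):
    model.train()
    model.zero_grad()
    running_loss = 0.0
    step = -1
    for step, (features, labels) in enumerate(dataloader):
        features, labels = features.to(device), labels.to(device)
        loss = torch.nn.functional.cross_entropy(model(features), labels)
        # Scale the loss so the accumulated gradient matches a single large-batch update.
        (loss / accumulation_steps).backward()
        running_loss += loss.item()
        if (step + 1) % accumulation_steps == 0:
            # In this sketch, clipping happens once per parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()   # one optimizer update per accumulation_steps micro-batches
            scheduler.step()   # the scheduler counts optimizer updates, not micro-batches
            model.zero_grad()
    return running_loss / max(1, step + 1)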
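`_get_scheduler` maps a name such as "warmuplinear" onto the corresponding `transformers` schedule, and `warmup_steps` is derived from `warmup_ratio` via `math.ceil`. A minimal sketch of that wiring, assuming an illustrative ratio of 0.2 and a placeholder linear model rather than this repository's model:

# Sketch of deriving warmup_steps and building a linear warmup schedule (illustrative values).
import math
import torch
import transformers

model = torch.nn.Linear(8, 2)                                 # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

total_train_steps = 1000
warmup_ratio = 0.2
warmup_steps = math.ceil(total_train_steps * warmup_ratio)    # 200 warm-up updates

scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_train_steps)

# The learning rate ramps up linearly over the first warmup_steps updates,
# then decays linearly to zero at total_train_steps.
for _ in range(5):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())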
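`Trainer.save` writes each sub-module into its own "<idx>_<ClassName>" folder and records the layout in modules.json alongside ck_report.json. The following is a minimal sketch of inspecting such a checkpoint directory; the path and the inspection helper are assumptions for illustration, not the repository's own loading code.

# Sketch of inspecting the checkpoint layout written by Trainer.save() (illustrative only).
import json
import os

def inspect_checkpoint(path):
    # modules.json lists one entry per saved sub-module folder.
    with open(os.path.join(path, "modules.json")) as f:
        modules = json.load(f)
    for entry in modules:
        folder = os.path.join(path, entry["path"])
        print(f'{entry["idx"]}: {entry["name"]} ({entry["type"]}) -> {folder}')
    # ck_report.json holds the evaluation details recorded at save time.
    report_file = os.path.join(path, "ck_report.json")
    if os.path.exists(report_file):
        with open(report_file) as f:
            print("eval details at save time:", json.load(f))

# Example (hypothetical path): inspect_checkpoint("model/your_run_output")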