├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── data.py
├── dataset
│   ├── README.md
│   └── convert.py
├── demo.gif
├── evaluate.py
├── hf_model.png
├── icon.png
├── interact.py
├── model.py
├── requirements.txt
├── train.py
└── trainer.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dataset/source_code/
2 | model/
3 | venv/
4 | .idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ![icon](icon.png) AutoCoder
2 |
3 |
4 |
5 | #### A basic and simple tool for code auto-completion, fine-tuned from the PyTorch [pre-trained GPT-2 variants](https://huggingface.co/transformers/pretrained_models.html) offered by the awesome [🤗 transformers](https://github.com/huggingface/transformers) library.
6 |
7 | ### Demo
8 | ![demo](demo.gif)
9 |
10 | ### [Play on 🤗HF's Model Hub](https://huggingface.co/congcongwang/gpt2_medium_fine_tuned_coder?text=%3Cpython%3E+def+factorial)👇
11 |
12 | ![AutoCoder on the 🤗 model hub](hf_model.png)
13 |
14 | ### Features
15 | - Code auto-completion for both Python and Java.
16 |
17 | ### Blog linked to this project
18 | - [The details of dataset construction and fine-tuning process](https://wangcongcong123.github.io/AutoCoder/)
19 |
20 | ### Quick Start
21 | There are three ways to get started quickly.
22 |
23 |
24 | #### Load from 🤗transformers models
25 | Now there are [two fine-tuned models](https://huggingface.co/models?search=congcongwang) uploaded to the 🤗 transformers model hub. They can be used easily as long as you `pip install transformers`:
26 |
27 |
28 | ```python
29 | from transformers import AutoTokenizer,AutoModelWithLMHead
30 | tokenizer = AutoTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
31 | model = AutoModelWithLMHead.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
32 | # or
33 | # tokenizer = AutoTokenizer.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
34 | # model = AutoModelWithLMHead.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
35 | use_cuda=True
36 | context="def factorial"
37 | lang="python" # can be java as well.
38 |
39 | if use_cuda:
40 | model.to("cuda")
41 |
42 | input_ids = tokenizer.encode("<python> " + context,
43 | return_tensors='pt') if lang == "python" else tokenizer.encode(
44 | "<java> " + context, return_tensors='pt')
45 | outputs = model.generate(input_ids=input_ids.to("cuda") if use_cuda else input_ids,
46 | max_length=128,
47 | temperature=0.7,
48 | num_return_sequences=1)
49 |
50 | decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
51 | print(decoded)
52 | ```
53 |
54 |
55 |
56 | #### Ready-to-go Interaction
57 | ```
58 | git clone https://github.com/wangcongcong123/auto_coding.git
59 | pip install -r requirements.txt
60 | ```
61 |
62 | 1. Download a fine-tuned model; two versions are provided:
63 | * [distilgpt2_fine_tuned_coder (params: 82M, size: 291MB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/distilgpt2_fine_tuned_coder.zip)
64 | * [gpt2_medium_fine_tuned_coder.zip (params: 345M, size: 1.22GB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/gpt2_medium_fine_tuned_coder.zip)
65 | 2. Unzip the model and move it to `./model` (create it first)
66 | 3. Run the interactive script: `python interact.py`
67 |
68 | #### Fine-tuning your own
69 | ```
70 | git clone https://github.com/wangcongcong123/auto_coding.git
71 | pip install -r requirements.txt
72 | ```
73 |
74 | 1. Prepare [the dataset](./dataset)
75 | 2. Start fine-tuning the model: `python train.py --model_select distilgpt2`
76 | 3. After fine-tuning, the model will be saved to `./model/distilgpt2_fine_tuned_coder/0_GPTSingleHead`, which is exactly the fine-tuned version provided in Ready-to-go Interaction.
77 |
78 | \* For more training parameter options, run `python train.py -h`
79 |
80 | ### Generative examples
81 | Good Python generation examples from the fine-tuned GPT-2 medium model:
82 | ```python
83 | -------------Example 1--------------------------------------
84 | Context code: def factorial
85 | Generated:
86 | def factorial(n):
87 | return 1 if n == 1 else n * factorial(n - 1)
88 |
89 | --------------Example 2-------------------------------------
90 | Context code: def sum_of_series(num_of_terms:int, power: int)
91 | Generated:
92 | def sum_of_series(num_of_terms:int, power: int) -> float:
93 | """Calculate Sum of Series.
94 | >>> sum_of_series(1, 1, 10)
95 | 55.0
96 | >>> sum_of_series(1, 10, 100)
97 | 49600.0
98 | """
99 | sum = (num_of_terms / 2) * power
100 | return sum
101 |
102 | --------------Example 3-------------------------------------
103 | Context code: def quick_sort(collection)
104 | Generated:
105 | def quick_sort(collection):
106 | """
107 | >>> quick_sort([0, 5, 3, 2, 2])
108 | [0, 2, 2, 3, 5]
109 | >>> quick_sort([])
110 | []
111 | >>> quick_sort([-2, -5, -45])
112 | [-45, -5, -2]
113 | """
114 | length = len(collection)
115 | if length <= 1:
116 | return collection
117 | else:
118 | # Use the last element as the first pivot
119 | pivot = collection.pop()
120 | # Put elements greater than pivot in greater list
121 | # Put elements lesser than pivot in lesser list
122 | greater, lesser = [], []
123 | for element in collection:
124 | if element > pivot:
125 | ...
126 | ```
127 | Good Java generation examples from the fine-tuned GPT-2 medium model:
128 |
129 | ```java
130 | --------------Example 1-------------------------------------
131 | Context code: Scanner input= new Scanner(System.in)
132 | Generated:
133 | Scanner input= new Scanner(System.in)
134 | System.out.println("Enter the Octal number: ");
135 | // Take octal number as input from user in a string
136 | String oct = input.next();
137 |
138 | --------------Example 2-------------------------------------
139 | Context code: private static int CountCharacters(String str)
140 | Generated:
141 | private static int CountCharacters(String str) {
142 | return str.replaceAll("\\s", "").length();
143 | }
144 | ```
145 | \* Although some generated examples look good, judging the model's actual performance from them requires a grain of salt: the model may simply **"remember"** existing code from the training set.
146 |
147 | ### TODO list
148 | - Expand the dataset (and construct the dataset more carefully) and increase the context window. Try larger generative models like GPT-2 large or even [GPT-3 variants](https://arxiv.org/abs/2005.14165) as proposed recently, if computational resources allow.
149 | - Remove overlap between training examples and dev examples for contamination studies. That is, study to what extent the model memorizes examples rigidly or [relies on surface heuristics learned during training](https://arxiv.org/pdf/1902.01007.pdf).
150 | - Try some adversarial examples (more complicated ones, to probe the model's reasoning capability) to test the robustness of the model.
151 | - Integrate this into a real-life use case such as a code editor - [Sublime Text](https://www.sublimetext.com/), where a joint-probability threshold may need to be studied for code snippet recommendations.
152 | - Try some ideas of location-aware code generation. For example, if a human coder is writing a comment, the autocoder should be aware of the coder's context (left and right, if available) to help complete the corresponding content.
153 | - Model size and inference efficiency are a problem in real-life use cases.
154 | - Do research in this problem domain to get a general idea of what work has been done in the literature for this particular problem.
155 |
156 |
157 |
158 | ### Extra notes
159 | * Multi-GPU training only works with torch==1.4.0; it does not work with torch==1.5.0. No fix has been found for this issue so far.
160 |
--------------------------------------------------------------------------------
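
To complement the quick-start above, here is a minimal, non-interactive sketch of the "Ready-to-go Interaction" path. It is not part of the repository: the local path `model/distilgpt2_fine_tuned_coder` is assumed from the download step, and `do_sample=True` is added so that the `temperature` setting actually takes effect.

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "model/distilgpt2_fine_tuned_coder"  # assumed location of the unzipped checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# "<python>" is the control token the fine-tuned models expect in front of the context code
input_ids = tokenizer.encode("<python> def factorial", return_tensors="pt")
outputs = model.generate(input_ids=input_ids, max_length=128,
                         temperature=0.7, do_sample=True, num_return_sequences=1)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
# follow interact.py's convention: keep only the text before the first blank line
print(decoded.split("\n\n")[0])
```
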
/data.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import os, pickle, json
3 | import logging
4 |
5 | logger = logging.getLogger(__name__)
6 | from tqdm import tqdm
7 |
8 | class SrcCodeDataset(Dataset):
9 | def __init__(self, file_path, model, cache_path=None):
10 | """
11 | this dataset class is used to load the source code dataset in batches for fine-tuning with GPT2LMHeadModel
12 | :param model: the model that the dataset will be fed to
13 | """
14 | self.inputs = []
15 | load_cache = False
16 | if cache_path != None:
17 | load_cache = self._load_cache(cache_path)
18 | if not load_cache:
19 | self._build(file_path, model)
20 | if cache_path != None:
21 | self._cache(cache_path)
22 |
23 | def __len__(self):
24 | return len(self.inputs)
25 |
26 | def __getitem__(self, index):
27 | input_ids = self.inputs[index]["input_ids"]
28 | # input_mask = self.inputs[index]["attention_mask"] we don't need attention_mask for this task
29 | # return {"input_ids": input_ids, "input_mask": input_mask}
30 | return {"input_ids": input_ids}
31 |
32 | def _load_cache(self, cache_path):
33 | load_cache = False
34 | if os.path.isdir(cache_path):
35 | if os.path.isfile(os.path.join(cache_path, "inputs.pk")):
36 | with open(os.path.join(cache_path, "inputs.pk"), "rb") as f:
37 | logger.info(
38 | f" load cached token ids of model from {cache_path}")
39 | self.inputs = pickle.load(f)
40 | load_cache = True
41 | return load_cache
42 |
43 | def _cache(self, cache_path):
44 | if not os.path.isdir(cache_path):
45 | os.makedirs(cache_path)
46 | with open(os.path.join(cache_path, "inputs.pk"), "wb") as f:
47 | pickle.dump(self.inputs, f)
48 | logger.info(
49 | f" save tokenized ids of samples to: {cache_path}/inputs.pk")
50 |
51 | def _build(self, file_path, model):
52 | with open(file_path) as f:
53 | for line in tqdm(f):
54 | example = json.loads(line.strip())
55 | if example["label"].lower() == "python":
56 | encoded_plus = model.tokenizer.encode_plus(
57 | model.tokenize("<python>") + example["token_ids"] + [model.eos_token_id],
58 | max_length=model.max_seq_length)
59 | elif example["label"].lower() == "java":
60 | encoded_plus = model.tokenizer.encode_plus(
61 | model.tokenize("<java>") + example["token_ids"] + [model.eos_token_id],
62 | max_length=model.max_seq_length)
63 | self.inputs.append(encoded_plus.data)
64 |
--------------------------------------------------------------------------------
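
A stand-alone sketch of what `SrcCodeDataset._build` above does for a single `train.jsonl` line, using `GPT2Tokenizer` directly instead of the `GPTSingleHead` wrapper. The example source line is made up, and the `<pad>`/`<python>`/`<java>` special tokens match the ones added in `train.py`.

```python
import json
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"pad_token": "<pad>", "additional_special_tokens": ["<python>", "<java>"]})

# one jsonl line as produced by dataset/convert.py: token ids plus a language label
line = json.dumps({"token_ids": tok.encode("def add(a, b):\n    return a + b\n"), "label": "Python"})

example = json.loads(line)
prefix = "<python>" if example["label"].lower() == "python" else "<java>"
control_ids = tok.convert_tokens_to_ids(tok.tokenize(prefix))

# final input: <control token> + source token ids + <eos>, truncated to the max sequence length
encoded = tok.encode_plus(control_ids + example["token_ids"] + [tok.eos_token_id],
                          max_length=256, truncation=True)
print(encoded["input_ids"][:6], "...", len(encoded["input_ids"]))
```
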
/dataset/README.md:
--------------------------------------------------------------------------------
1 | ### Download directly from [here](https://ucdcs-student.ucd.ie/~cwang/autocoder/source_code.zip)
2 |
3 | Unzip the source_code archive and move it under this directory.
4 |
5 |
6 | ### Or build dataset from scratch
7 | This allows you to customize the dataset building. Below is an example of the building process.
8 |
9 | Let's use the Python and Java code from [The Algorithms project](https://github.com/TheAlgorithms) as the dataset. We want AutoCoder to help auto-complete code at a general level, and the code of The Algorithms suits this need. Another reason is that, personally, I think the code in this project is well written (high quality!).
10 |
11 | ##### download source code
12 | ```
13 | git clone https://github.com/TheAlgorithms/Python
14 | git clone https://github.com/TheAlgorithms/Java
15 | ```
16 |
17 | ##### Move the two downloaded folders into this `dataset/` directory and then run
18 |
19 | ```
20 | python convert.py --segment_len 256 --stride 10 --dev_size 0.1
21 | ```
22 |
23 | You will then find a train set named `train.jsonl` and a dev set named `dev.jsonl` under `source_code/json/`.
24 |
25 | Have a look at the `convert.py` script for the specific process of dataset construction or quickly read [this blog](#).
26 |
27 |
--------------------------------------------------------------------------------
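
As a quick sanity check of the build step described above (assuming `convert.py` has been run and `source_code/json/train.jsonl` exists), one can decode the first training segment back into source text:

```python
import json
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)

with open("source_code/json/train.jsonl") as f:
    first = json.loads(next(f))

print(first["label"])           # "Python" or "Java"
print(len(first["token_ids"]))  # at most --segment_len tokens
print(tok.decode(first["token_ids"])[:300])  # beginning of the original code snippet
```
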
/dataset/convert.py:
--------------------------------------------------------------------------------
1 | import glob, json, os, argparse
2 | from tqdm import tqdm
3 | from sklearn.model_selection import train_test_split
4 | from transformers import GPT2Tokenizer
5 |
6 | if __name__ == '__main__':
7 |
8 | parser = argparse.ArgumentParser(description='Params')
9 | parser.add_argument('--segment_len', type=int, default=254,
10 | help='the length of each example')
11 | # we set this to 254 instead of 256 because each final input looks like: <python>/<java> + token_ids + <eos>, i.e., 256 tokens in total
12 | parser.add_argument('--stride', type=int, default=10,
13 | help='stride to split training examples')
14 | parser.add_argument('--dev_size', type=float, default=0.1,
15 | help='split ratio of development set for each language')
16 | args = parser.parse_args()
17 |
18 | gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
19 | paths = ['Python', 'Java']
20 | segments = {}
21 |
22 | for path in paths:
23 | source_files = glob.glob(f'{path}/**/*.py' if path == "Python" else f'{path}/**/*.java', recursive=True)
24 | for each_src in tqdm(source_files):
25 | with open(each_src, "r", encoding="utf-8") as f:
26 | code_content = f.read()
27 | encoded = gpt2_tok.encode(code_content)
28 | for i in range(len(encoded) // args.stride):
29 | seg = encoded[i * args.stride:i * args.stride + args.segment_len]
30 | if path not in segments:
31 | segments[path] = []
32 | segments[path].append(json.dumps({"token_ids": seg, "label": path}))
33 |
34 | train, dev = [], []
35 | for key in segments:
36 | # we don't shuffle before splitting because we want the train and dev to be very different (less overlapping)
37 | tr, de = train_test_split(segments[key], test_size=args.dev_size)
38 | train += tr
39 | dev += de
40 |
41 | to_path = "source_code/json"
42 | if not os.path.isdir(to_path):
43 | os.makedirs(to_path)
44 |
45 | with open(os.path.join(to_path, "train.jsonl"), "w") as f:
46 | f.write("\n".join(train))
47 |
48 | with open(os.path.join(to_path, "dev.jsonl"), "w") as f:
49 | f.write("\n".join(dev))
50 |
--------------------------------------------------------------------------------
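
The key step in `convert.py` above is the overlapping sliding-window split over each tokenized source file: window `i` covers token positions `[i*stride, i*stride + segment_len)`. A tiny self-contained illustration with toy numbers instead of real token ids:

```python
def split_segments(encoded, stride=10, segment_len=254):
    # same indexing as convert.py: consecutive windows overlap by (segment_len - stride) tokens
    return [encoded[i * stride: i * stride + segment_len]
            for i in range(len(encoded) // stride)]

toy_ids = list(range(30))  # pretend these are 30 token ids
for seg in split_segments(toy_ids, stride=10, segment_len=20):
    print(seg[0], seg[-1], len(seg))
# 0 19 20
# 10 29 20
# 20 29 10
```
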
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/demo.gif
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict
2 | import torch
3 | import logging
4 | from torch.utils.data import DataLoader
5 | from tqdm import tqdm
6 |
7 | logging.basicConfig(
8 | format=logging.BASIC_FORMAT,
9 | datefmt='%Y-%m-%d %H:%M:%S',
10 | level=logging.INFO
11 | )
12 | logger = logging.getLogger(__name__)
13 |
14 | class SingleCLMEvaluator():
15 | def __init__(self, dataloader: DataLoader = None,
16 | data_tag: str = "dev",
17 | device: int = None, tokenizer=None, early_stop_on: str = "perplexity"):
18 |
19 | if data_tag not in ["dev", "train", "test"]:
20 | raise ValueError("data_tag has to be one of dev, train or test")
21 | assert early_stop_on in ["loss", "perplexity"]
22 | self.early_stop_on = early_stop_on
23 | self.dataloader = dataloader
24 | self.data_tag = data_tag
25 | self.tokenizer = tokenizer
26 |
27 | self.n_gpu = torch.cuda.device_count()
28 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29 | if device == -1:
30 | self.n_gpu = 0
31 | self.device = torch.device("cpu")
32 |
33 | def reset_dataloader(self, dataloader: DataLoader):
34 | self.dataloader = dataloader
35 |
36 | def reset_logger(self, output_path):
37 | pass
38 |
39 | def __call__(self, model, collate_fn, output_path: str = None, epoch: int = -1, steps: int = -1,
40 | target_names: List[str] = None, do_predict: bool = False) -> Dict[
41 | str, float]:
42 |
43 | if do_predict and self.tokenizer == None:
44 | raise ValueError("you are doing predict so need a tokenizer")
45 | if self.dataloader is None:
46 | raise ValueError(" need to set dataloader for this evaluator, call reset_dataloader()")
47 |
48 | model.eval()
49 | if epoch == -1 and steps == -1:
50 | logger.info(
51 | f"\nEvaluation the model on {self.data_tag} dataset")
52 | else:
53 | logger.info(
54 | "\nEvaluation the model on " + self.data_tag + " dataset" + f" in epoch {epoch} after {steps} steps:")
55 |
56 | self.dataloader.collate_fn = collate_fn
57 | total_loss = 0.0
58 | total_steps = 0
59 |
60 | for step, batch in enumerate(tqdm(self.dataloader, desc="evaluating")):
61 | input = batch["features"]
62 | # batch to device
63 | for feature_name, ids in input.items():
64 | input[feature_name] = ids.to(self.device)
65 |
66 | with torch.no_grad():
67 | loss, logits = model(input)
68 | loss = loss.mean()
69 | total_loss += loss
70 |
71 | total_steps += 1
72 | eval_loss = total_loss / total_steps
73 | eval_results = {"loss": eval_loss}
74 |
75 | perplexity = torch.exp(torch.tensor(eval_loss)).clone().detach()
76 | eval_results["perplexity"] = perplexity.mean().item()
77 | return eval_results
78 |
--------------------------------------------------------------------------------
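
The evaluator above reports perplexity as the exponential of the mean cross-entropy loss over the dev set. A tiny numeric illustration with made-up per-batch losses:

```python
import torch

batch_losses = [2.31, 2.05, 2.18]  # hypothetical per-batch LM losses
mean_loss = sum(batch_losses) / len(batch_losses)
perplexity = torch.exp(torch.tensor(mean_loss)).item()
print(round(mean_loss, 2), round(perplexity, 2))  # 2.18 8.85
```
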
/hf_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/hf_model.png
--------------------------------------------------------------------------------
/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/icon.png
--------------------------------------------------------------------------------
/interact.py:
--------------------------------------------------------------------------------
1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
2 | import argparse
3 |
4 | if __name__ == '__main__':
5 |
6 | parser = argparse.ArgumentParser(description='Params')
7 | parser.add_argument('--model_path', type=str, default="model/gpt2_medium_fine_tuned_coder",
8 | help='the path to load fine-tuned model')
9 | parser.add_argument('--max_length', type=int, default=128,
10 | help='maximum length for code generation')
11 | parser.add_argument('--temperature', type=float, default=0.7,
12 | help='temperature for sampling-based code generation')
13 | parser.add_argument(
14 | "--use_cuda", action="store_true", help="inference with gpu?"
15 | )
16 |
17 | args = parser.parse_args()
18 |
19 | # load the fine-tuned model and tokenizer from the path
20 | model = GPT2LMHeadModel.from_pretrained(args.model_path)
21 | tokenizer = GPT2Tokenizer.from_pretrained(args.model_path)
22 |
23 | model.eval()
24 | if args.use_cuda:
25 | model.to("cuda")
26 |
27 | # the fine-tuned model supports two programming languages, namely python and java
28 | def lang_select():
29 | lang = ""
30 | while lang not in ["python", "java"]:
31 | print('Enter the programming language you prefer (python or java)')
32 | lang = input(">>> ").lower()
33 | return lang
34 |
35 |
36 | lang = lang_select()
37 |
38 | context = ""
39 | while context != "exit":
40 | print(f'You are using {lang} now. Enter the context code (exit or change_lang)')
41 | context = input(">>> ")
42 |
43 | if context == "change_lang":
44 | lang = lang_select()
45 |
46 | print(f"You are using {lang} now. Enter the context code")
47 | context = input(">>> ")
48 |
49 | input_ids = tokenizer.encode("<python> " + context,
50 | return_tensors='pt') if lang == "python" else tokenizer.encode(
51 | "<java> " + context, return_tensors='pt')
52 | outputs = model.generate(input_ids=input_ids.to("cuda") if args.use_cuda else input_ids,
53 | max_length=args.max_length,
54 | temperature=args.temperature,
55 | num_return_sequences=1)
56 | for i in range(1):
57 | decoded = tokenizer.decode(outputs[i], skip_special_tokens=True)
58 | # cut at the first occurrence of a double newline (to meet the convention of code completion)
59 | if "\n\n" in decoded:
60 | decoded = decoded[:decoded.index("\n\n")]
61 |
62 | print('Generated {}: {}'.format(i, decoded))
63 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer
2 | import json
3 | from typing import Dict
4 | import os
5 | import torch
6 | from torch import nn, Tensor
7 |
8 | import logging
9 | logging.basicConfig(
10 | format=logging.BASIC_FORMAT,
11 | datefmt='%Y-%m-%d %H:%M:%S',
12 | level=logging.INFO
13 | )
14 | logger = logging.getLogger(__name__)
15 |
16 | class GPTSingleHead(nn.Module):
17 | """
18 | Different from directly using GPT2LMHeadModel, this wraps up GPT2LMHeadModel as well as GPT2Tokenizer
19 | """
20 | def __init__(self, model_name_or_path: str, max_seq_length: int = 256, do_lower_case: bool = False,
21 | special_words_to_add=None):
22 | super(GPTSingleHead, self).__init__()
23 | self.config_keys = ['max_seq_length', 'do_lower_case']
24 | self.do_lower_case = do_lower_case
25 | if max_seq_length > 1024:
26 | logging.warning(
27 | "GPT only allows a max_seq_length of 1024. Value will be set to 1024")
28 | max_seq_length = 1024
29 | self.max_seq_length = max_seq_length
30 | self.gpt = GPT2LMHeadModel.from_pretrained(model_name_or_path)
31 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
32 | if special_words_to_add != None:
33 | self.add_special_words(special_words_to_add)
34 |
35 | self.bos_token_id=self.tokenizer.bos_token_id
36 | self.eos_token_id=self.tokenizer.eos_token_id
37 | # self.pad_token_id=self.tokenizer.pad_token_id
38 |
39 | def tokenize(self, text: str): # default for cls
40 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
41 |
42 | def add_special_words(self, special_words_to_add):
43 | orig_num_tokens = len(self.tokenizer)
44 | num_added_tokens = self.tokenizer.add_special_tokens(special_words_to_add)
45 | if num_added_tokens > 0:
46 | self.gpt.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
47 |
48 | def forward(self, input: Dict[str, torch.Tensor]):
49 | loss, logits=self.gpt(input["input_ids"],labels=input["input_ids"])[:2]
50 | return loss, logits
51 |
52 | def get_config_dict(self):
53 | return {key: self.__dict__[key] for key in self.config_keys}
54 |
55 | def padding_features(self, features_dict_list):
56 | """
57 | padding features for a batch
58 | :param features_dict_list: i.e., batch
59 | :return: padded batch features
60 | """
61 | max_input_len_this_batch = 0
62 |
63 | batch_features = {feature_name: [] for feature_name in features_dict_list[0]}
64 | for feature_dict in features_dict_list:
65 | for feature_name, feature_ids in feature_dict.items():
66 | if feature_name == "input_ids" and len(feature_ids) > max_input_len_this_batch:
67 | max_input_len_this_batch = len(feature_ids)
68 | batch_features[feature_name].append(feature_ids)
69 |
70 | padded_batch_features = {feature_name: [] for feature_name in features_dict_list[0]}
71 | for feature_name, batch_ids in batch_features.items():
72 |
73 | for each_ids in batch_ids:
74 | padded = each_ids + [self.tokenizer.pad_token_id] * (max_input_len_this_batch - len(each_ids))
75 | padded_batch_features[feature_name].append(padded)
76 |
77 | for feature_name, ids in padded_batch_features.items():
78 | padded_batch_features[feature_name] = torch.tensor(ids)
79 |
80 | return padded_batch_features
81 |
82 | def get_embedding_dimension(self) -> int:
83 | return self.gpt.config.hidden_size
84 |
85 | def get_config(self):
86 | return self.gpt.config
87 |
88 | def save(self, output_path: str):
89 | self.gpt.save_pretrained(output_path)
90 | self.tokenizer.save_pretrained(output_path)
91 | with open(os.path.join(output_path, 'gpt_sh_config.json'), 'w') as f:
92 | json.dump(self.get_config_dict(), f, indent=2)
93 |
94 | def reload(self, input_path: str):
95 | """reload from checkpoint weights"""
96 | return GPTSingleHead.load(input_path + "/0_GPTSingleHead")
97 |
98 | @staticmethod
99 | def load(input_path: str):
100 | if not os.path.isfile(os.path.join(input_path, 'gpt_sh_config.json')):
101 | raise ValueError("In the model path does not find gpt_sh_config.json file, you may have not trained yet")
102 | with open(os.path.join(input_path, 'gpt_sh_config.json')) as f:
103 | config = json.load(f)
104 | return GPTSingleHead(model_name_or_path=input_path, **config)
105 |
106 |
107 | class EmptyHeads(nn.Module):
108 | def __init__(self):
109 | self.config_keys=[]
110 | super().__init__()
111 |
112 | def forward(self, input: Dict[str, Tensor]):
113 | return input
114 |
115 | def get_config_dict(self):
116 | return {key: self.__dict__[key] for key in self.config_keys}
117 |
118 | def save(self, output_path):
119 | with open(os.path.join(output_path, 'empty_heads_config.json'), 'w') as f:
120 | json.dump(self.get_config_dict(), f, indent=2)
121 | torch.save(self.state_dict(), os.path.join(output_path, 'empty_heads.pt'))
122 |
123 | def load_saved(self, input_path):
124 | self.load_state_dict(torch.load(os.path.join(input_path, '1_EmptyHeads', 'empty_heads.pt')))
125 |
126 | @staticmethod
127 | def load(input_path,config):
128 | if not os.path.isfile(os.path.join(input_path, 'empty_heads_config.json')):
129 | raise ValueError(
130 | "empty_heads_config.json was not found in the model path, you may not have trained the model yet")
131 |
132 | with open(os.path.join(input_path, 'empty_heads_config.json')) as f:
133 | config = json.load(f)
134 | model = EmptyHeads()
135 |
136 | if not os.path.isfile(os.path.join(input_path, 'empty_heads.pt')):
137 | raise ValueError("In the model path does not find state of file, you need to train and get weights first")
138 |
139 | model.load_state_dict(torch.load(os.path.join(input_path, 'empty_heads.pt')))
140 | return model
141 |
--------------------------------------------------------------------------------
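
A minimal usage sketch for `GPTSingleHead` above (assuming `model.py` is importable and the `distilgpt2` weights can be downloaded): it shows how `padding_features` right-pads a batch and how `forward` computes the causal-LM loss by reusing `input_ids` as labels. The toy batch contents are made up.

```python
from model import GPTSingleHead

m = GPTSingleHead("distilgpt2", max_seq_length=256)
m.add_special_words({"pad_token": "<pad>", "additional_special_tokens": ["<python>", "<java>"]})

# two raw examples of different lengths, in the same format SrcCodeDataset produces
batch = [
    {"input_ids": m.tokenize("<python>") + m.tokenize("def add(a, b): return a + b") + [m.eos_token_id]},
    {"input_ids": m.tokenize("<python>") + m.tokenize("print(1)") + [m.eos_token_id]},
]

features = m.padding_features(batch)  # right-pads to the longest sequence in the batch
print(features["input_ids"].shape)    # torch.Size([2, <longest length>])

loss, logits = m(features)            # language-modeling loss over the batch
print(float(loss))
```
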
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scikit-learn
3 | transformers
4 | torch==1.4.0
5 | numpy
6 | wandb
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import argparse, os
2 | import logging
3 |
4 | logging.basicConfig(
5 | format=logging.BASIC_FORMAT,
6 | datefmt='%Y-%m-%d %H:%M:%S',
7 | level=logging.INFO
8 | )
9 | logger = logging.getLogger(__name__)
10 |
11 | MODEL_MAP = {"distilgpt2": "distilgpt2", "gpt2": "gpt2", "gpt2_medium": "gpt2-medium",
12 | "gpt2_large": "gpt2-large"}
13 |
14 | from model import GPTSingleHead
15 | from trainer import ModelTrainer
16 | from data import SrcCodeDataset
17 | from evaluate import SingleCLMEvaluator
18 |
19 | if __name__ == '__main__':
20 | parser = argparse.ArgumentParser(description='Hyper params')
21 | parser.add_argument('--model_select', type=str, default="distilgpt2",
22 | help='model select from distilgpt2, gpt2_medium, gpt2, or gpt2_large')
23 | parser.add_argument('--dataset_name', type=str, default="source_code",
24 | help='dataset name, i.e., whatever folder name you put under the ./dataset directory (by default: source_code)')
25 | parser.add_argument('--per_gpu_train_batch_size', type=int, default=4,
26 | help='input batch size for training')
27 | parser.add_argument('--dev_batch_size', type=int, default=8,
28 | help='input batch size for development')
29 | parser.add_argument('--num_epochs_train', type=int, default=16,
30 | help='number of epochs to train')
31 | parser.add_argument('--max_seq_length', type=int, default=256,
32 | help='maximum sequence length of samples in a batch for training')
33 | parser.add_argument('--lr', type=float, default=2e-5,
34 | help='learning rate')
35 | parser.add_argument('--warmup_ratio', type=float, default=0.2,
36 | help='warmup_ratio')
37 | parser.add_argument('--early_stop', type=int, default=20,
38 | help='early_stop')
39 | parser.add_argument('--scheduler', type=str, default="warmuplinear",
40 | help='scheduler')
41 | parser.add_argument('--seed', type=int, default=122,
42 | help='random seed')
43 | parser.add_argument('--accumulation_steps', type=int, default=1,
44 | help='gradient accumulation steps, for when you want a larger batch size than fits in the memory available')
45 | parser.add_argument('--n_gpu', type=int, default=1,
46 | help='number of gpu for training')
47 | parser.add_argument('--visiable_device', type=str, default="0",
48 | help='visible GPUs for training, should be consistent with n_gpu')
49 | parser.add_argument('--evaluation_steps', type=int, default=200,
50 | help='evaluation_steps')
51 | parser.add_argument('--wandb_project_name', type=str, default="code_generate",
52 | help='project name for wandb')
53 | parser.add_argument(
54 | "--restore_training", action="store_true", help="restore training if a saved checkopint exists"
55 | )
56 | parser.add_argument(
57 | "--with_wandb", action="store_true", help="Train with wandb tracking."
58 | )
59 |
60 | args = parser.parse_args()
61 | logger.info(f"args: {args}")
62 | dataset_folder = f"dataset/{args.dataset_name}/json/"
63 | assert args.model_select in MODEL_MAP.keys(), (f"model has to be in {MODEL_MAP.keys()}")
64 | output_path = f"model/{args.model_select}_fine_tuned_coder"
65 | logger.info("{} for dataset in: {}".format(output_path, dataset_folder))
66 | logger.info(
67 | f"*****************model select: {args.model_select} for code generation using dataset: {args.dataset_name}******************")
68 | # add more params for wandb
69 | args.wandb_run_name = output_path
70 | #initialize model by model name (the same as used in transformers lib)
71 | model = GPTSingleHead(MODEL_MAP[args.model_select], max_seq_length=args.max_seq_length)
72 | # add special tokens (<python>, <java>, <pad>) for controlling code generation by programming language
73 | model.add_special_words({"pad_token": "<pad>", "additional_special_tokens": ["<python>", "<java>"]})
74 | #load training dataset
75 | file_path = dataset_folder + "train.jsonl"
76 | train_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "train"))
77 | # load development dataset
78 | file_path = dataset_folder + "dev.jsonl"
79 | dev_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "dev"))
80 | # initialize development evaluator
81 | dev_evaluator = SingleCLMEvaluator()
82 | # initialize model trainer
83 | model_trainer = ModelTrainer(model,
84 | train_dataset=train_dataset,
85 | dev_dataset=dev_dataset,
86 | dev_evaluator=dev_evaluator,
87 | scheduler=args.scheduler,
88 | epochs=args.num_epochs_train,
89 | per_gpu_train_batch_size=args.per_gpu_train_batch_size,
90 | output_path=output_path,
91 | optimizer_params={'lr': args.lr, 'eps': 1e-6, 'correct_bias': False},
92 | evaluation_steps=args.evaluation_steps,
93 | early_stop=args.early_stop,
94 | dev_batch_size=args.dev_batch_size,
95 | restore_training=args.restore_training,
96 | accumulation_steps=args.accumulation_steps,
97 | n_gpu=args.n_gpu,
98 | visiable_device=args.visiable_device,
99 | warmup_ratio=args.warmup_ratio,
100 | seed=args.seed,
101 | data_loader_shuffle=True,
102 | wandb_config=args if args.with_wandb else None)
103 | #start training
104 | model_trainer.train()
105 |
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import os
4 | import shutil
5 | import sys
6 | from typing import Type, Dict
7 |
8 | import torch
9 | import transformers
10 |
11 | try:
12 | from torch.utils.tensorboard import SummaryWriter
13 | except ImportError:
14 | from tensorboardX import SummaryWriter
15 | from torch import nn
16 | from torch.utils.data import DataLoader
17 | from torch.utils import data
18 | from torch.optim.optimizer import Optimizer
19 | from tqdm import trange, tqdm
20 |
21 | from dateutil.relativedelta import relativedelta
22 |
23 | import random
24 | import numpy as np
25 | import logging
26 | from model import EmptyHeads
27 |
28 | logging.basicConfig(
29 | format=logging.BASIC_FORMAT,
30 | datefmt='%Y-%m-%d %H:%M:%S',
31 | level=logging.INFO
32 | )
33 | logger = logging.getLogger(__name__)
34 |
35 | from datetime import datetime
36 |
37 | try:
38 | import wandb
39 |
40 | wandb.ensure_configured()
41 | if wandb.api.api_key is None:
42 | _has_wandb = False
43 | wandb.termwarn("W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.")
44 | else:
45 | _has_wandb = False if os.getenv("WANDB_DISABLED") else True
46 | except ImportError:
47 | _has_wandb = False
48 |
49 |
50 | def set_seed(seed, n_gpu):
51 | logger.info(f" see seed for random, numpy and torch {seed}")
52 | random.seed(seed)
53 | np.random.seed(seed)
54 | torch.manual_seed(seed)
55 | if n_gpu > 0:
56 | torch.cuda.manual_seed_all(seed)
57 |
58 |
59 | def print_model_state_dict(model):
60 | for param_tensor in model.state_dict():
61 | logger.info(f"{param_tensor}\t{model.state_dict()[param_tensor].size()}")
62 |
63 |
64 | def print_optimizer_state_dict(optimizer):
65 | for var_name in optimizer.state_dict():
66 | logger.info(f"{var_name}\t{optimizer.state_dict()[var_name]}")
67 |
68 |
69 | def count_params(model: torch.nn.Module, print_details: bool = False):
70 | trainable_count = 0
71 | total_count = 0
72 | if isinstance(model, torch.nn.Sequential):
73 | for index in model._modules:
74 | if print_details:
75 | print_model_state_dict(model._modules[index])
76 | logger.info(model._modules[index])
77 | trainable_count += sum(p.numel() for p in model._modules[index].parameters() if p.requires_grad)
78 | total_count += sum(p.numel() for p in model._modules[index].parameters())
79 | else:
80 | if print_details:
81 | print_model_state_dict(model)
82 | logger.info(model)
83 | total_count = sum(p.numel() for p in model.parameters())
84 | trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
85 | logger.info(f' Total params: {total_count}')
86 | logger.info(f' Trainable params: {trainable_count}')
87 | logger.info(f' Non-trainable params: {total_count - trainable_count}')
88 |
89 |
90 | def batch_to_device(batch, device, keep_label=False):
91 | features = batch['features']
92 | if isinstance(features, dict):
93 | for feature_name in features:
94 | features[feature_name] = features[feature_name].to(device)
95 | else:
96 | for inx in range(len(features)):
97 | for feature_name in features[inx]:
98 | features[inx][feature_name] = features[inx][feature_name].to(device)
99 |
100 | label_space = batch['labels']
101 | if label_space == None: # for tasks like lm, labels are none.
102 | return features, None
103 | if not keep_label:
104 | labels = {"label_space_" + str(inx): label_space[inx].to(device) if torch.is_tensor(label_space[inx]) else
105 | label_space[inx] for inx in range(len(label_space))}
106 | else:
107 | labels = label_space
108 | return features, labels
109 |
110 |
111 | def is_wandb_available():
112 | return _has_wandb
113 |
114 |
115 | class CollateFunction():
116 | def __init__(self, up_model):
117 | self.up_model = up_model
118 |
119 | def __call__(self, batch):
120 | if isinstance(batch[0], dict):
121 | padded_features = self.up_model.padding_features(batch)
122 | return {'features': padded_features,
123 | "labels": None} # label_ids are in features, this task does not need labels, we set
124 |
125 |
126 | class ModelTrainer():
127 | def __init__(self, up_model: nn.Module, down_layer: nn.Module = None, train_dataset=None,
128 | dev_dataset=None, dev_evaluator=None,
129 | epochs: int = 1,
130 | visiable_device: str = "0",
131 | scheduler: str = 'warmuplinear',
132 | warmup_ratio: float = 0.1,
133 | optimizer_class: Type[Optimizer] = transformers.AdamW,
134 | optimizer_params: Dict[str, object] = {'lr': 5e-5, 'eps': 1e-6, 'correct_bias': False},
135 | weight_decay: float = 0.01,
136 | early_stop: int = 20,
137 | # 20 evaluation steps without improving on the early_stop_on metric as specified in dev_evaluator
138 | evaluation_steps: int = 500,
139 | output_path: str = None,
140 | save_best_model: bool = True,
141 | max_grad_norm: float = 1,
142 | fp16: bool = False,
143 | accumulation_steps=1,
144 | fp16_opt_level: str = 'O1',
145 | seed: int = 122,
146 | data_loader_shuffle=True,
147 | device: str = None,
148 | dev_batch_size: int = -1, # the same as train_batch_size
149 | n_gpu: int = None,
150 | report_model: bool = True,
151 | per_gpu_train_batch_size: int = 8,
152 | restore_training: bool = False,
153 | local_rank: int = -1,
154 | wandb_config=None):
155 | """
156 | this trainer is written for training a sequential model that contains an upstream_layer (usually a transformer)
157 | and a downstream_layer (usually a task-specific head like FF, RNN, CNN for encoding the output of the upstream_layer)
158 |
159 | :param up_model: transformers like transformers.GPT2LMHeadModel or transformers.BERTModel
160 | :param down_layer: None if up_model already wraps up with an output encoder such as LMHead in GPT2LMHeadModel, else nn.Module for encoding the output of up_model
161 | :param train_dataset: train_dataset, it can be either instance of torch.data.Dataset or IterableDataset (defined in data.py)
162 | :param dev_dataset: dev_dataset, it can be either instance of torch.data.Dataset or IterableDataset
163 | :param dev_evaluator: dev_evaluator, evaluator on dev_dataset for early stop and performance tracking during training (defined in evaluate.py)
164 | :param epochs: number of epochs for training
165 | :param visiable_device: visible GPU devices chosen to perform training
166 | :param scheduler: scheduler from transformers: see options in self._get_scheduler
167 | :param warmup_ratio: warmup ratio of total training steps for the learning rate
168 | :param optimizer_class: transformers.AdamW by default
169 | :param optimizer_params: optimizer params
170 | :param weight_decay: weight decay
171 | :param early_stop: early stop patience, in evaluation steps
172 | :param evaluation_steps: evaluation/logging interval in steps
173 | :param output_path: path to save the checkpoint with the best performance as specified in early_stop_on in dev_evaluator instance
174 | :param save_best_model:save best checkpoint or the latest checkpoint
175 | :param max_grad_norm:max grad norm
176 | :param fp16: fp16 training
177 | :param accumulation_steps:accumulation steps
178 | :param fp16_opt_level:fp16 opt level
179 | :param seed:random seed for reproducibility
180 | :param data_loader_shuffle:Whether to shuffle data_loader of training dataset and dev dataset after epoch ends
181 | :param device: device for training, None or cuda for gpu training, cpu for cpu training
182 | :param dev_batch_size: development batch size, usually larger than training batch size due to no grads calculation and hence less burden on memory
183 | :param n_gpu: number of gpus for training
184 | :param report_model:if report model's structure and number of trainable params in logging
185 | :param per_gpu_train_batch_size: training batch size per GPU
186 | :param restore_training: whether to restore training if the training process was interrupted by some accident
187 | :param local_rank:for distributed training
188 | :param wandb_config: wandb logging if not none, else without wandb logging
189 | """
190 |
191 | self.up_model = up_model
192 | if down_layer == None:
193 | # In this example, the upstream_layer already integrates the downstream head (namely, a simple LM head as in transformers.GPT2LMHeadModel)
194 | # EmptyHeads is created here only for placeholder purpose
195 | down_layer = EmptyHeads()
196 |
197 | self.down_layer = down_layer
198 | assert output_path != None
199 | output_path = os.path.join("tmp", output_path)
200 | # os.makedirs(output_path,exist_ok=True)
201 | if restore_training:
202 | if not os.listdir(output_path):
203 | raise ValueError(f"no checkpoint found in {output_path}")
204 | else:
205 | logger.info(" loading embedding weights from saved checkpoint")
206 | self.up_model = self.up_model.reload(
207 | output_path) # for other transformers (apart from bert), the load_saved function has not been added
208 |
209 | logger.info(" loading downstream weights from saved checkpoint")
210 | self.down_layer.load_saved(output_path)
211 | with open(output_path + "/ck_report.json") as f:
212 | self.ck_report = json.load(f)
213 |
214 | self.model = torch.nn.Sequential(self.up_model, self.down_layer)
215 |
216 | if is_wandb_available() and wandb_config != None:
217 | # keep track of model topology and gradients if is_wandb_available and args!=None
218 | wandb.init(project=wandb_config.wandb_project_name, config=wandb_config, name=wandb_config.wandb_run_name)
219 | wandb.watch(
220 | (self.up_model, self.down_layer), log_freq=max(100, evaluation_steps)
221 | )
222 | self.wandb_config = wandb_config
223 |
224 | self._restore_training = restore_training
225 | self.early_stop = early_stop
226 |
227 | self._dev_evaluator = dev_evaluator
228 |
229 | self._evaluation_steps = evaluation_steps
230 | self._save_best_model = save_best_model
231 | self._max_grad_norm = max_grad_norm
232 |
233 | os.makedirs(output_path, exist_ok=True)
234 | if os.listdir(output_path) and not restore_training:
235 | out = input(
236 | "Output directory ({}) already exists and is not empty, do you want to remove it before starting? (y/n)".format(
237 | output_path))
238 | if out == "y":
239 | shutil.rmtree(output_path)
240 | os.makedirs(output_path, exist_ok=True)
241 | else:
242 | raise ValueError("Output directory ({}) already exists and is not empty".format(
243 | output_path))
244 |
245 | logFormatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
246 | fileHandler = logging.FileHandler(os.path.join(output_path, "log.out"), mode="a")
247 | fileHandler.setFormatter(logFormatter)
248 | logger.addHandler(fileHandler)
249 | self._dev_evaluator.reset_logger(output_path)
250 |
251 | self.output_path = output_path
252 |
253 | if device is None or device == "cuda":
254 | if torch.cuda.is_available():
255 | device = torch.device("cuda")
256 | n_gpu = 1 if n_gpu == 1 else torch.cuda.device_count()
257 | else:
258 | logger.warning("no CUDA device found on this machine, using CPU instead")
259 | device = torch.device("cpu")
260 | n_gpu = 0
261 | elif device == "cpu":
262 | device = torch.device("cpu")
263 | n_gpu = 0
264 | else:
265 | raise ValueError("set device to be None, cuda or cpu")
266 | assert n_gpu <= torch.cuda.device_count()
267 |
268 | logger.info("Use pytorch device: {}, with gpu_number={}".format(device, n_gpu))
269 |
270 | self._train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
271 | self._dev_batch_size = dev_batch_size if dev_batch_size != -1 else self._train_batch_size
272 |
273 | if isinstance(train_dataset, data.IterableDataset):
274 | self._train_dataloader = DataLoader(train_dataset, batch_size=None)
275 | self._steps_per_epoch = len(self._train_dataloader.dataset)
276 | else:
277 | self._train_dataloader = DataLoader(train_dataset, shuffle=data_loader_shuffle,
278 | batch_size=self._train_batch_size)
279 | self._steps_per_epoch = len(self._train_dataloader)
280 |
281 | if isinstance(dev_dataset, data.IterableDataset):
282 | dev_dataloader = DataLoader(dev_dataset, batch_size=None)
283 | else:
284 | dev_dataloader = DataLoader(dev_dataset, shuffle=data_loader_shuffle, batch_size=self._dev_batch_size)
285 |
286 | if accumulation_steps > 1:
287 | self._steps_per_epoch = self._steps_per_epoch // accumulation_steps
288 |
289 | self._dev_data = dev_dataset
290 | self._dev_evaluator.reset_dataloader(dev_dataloader)
291 |
292 | self.collate_fn = CollateFunction(self.up_model)
293 | # Use customize batching
294 | self._train_dataloader.collate_fn = self.collate_fn
295 |
296 | self._train_data = train_dataset
297 | self._per_gpu_train_batch_size = per_gpu_train_batch_size
298 |
299 | set_seed(seed, n_gpu)
300 |
301 | if n_gpu > 1:
302 | self.model = torch.nn.DataParallel(self.model, device_ids=[int(i) for i in visiable_device.split(',')])
303 | self.model = self.model.to(f'cuda:{self.model.device_ids[0]}')
304 |
305 | elif n_gpu == 1:
306 | self.model = self.model.to(device)
307 |
308 | self._device = device
309 | self._n_gpu = n_gpu
310 |
311 | self._total_train_steps = int(self._steps_per_epoch * epochs)
312 | self._epochs = epochs
313 |
314 | if report_model:
315 | count_params(self.model, print_details=True)
316 |
317 | param_optimizer = list(self.model.named_parameters())
318 |
319 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
320 | optimizer_grouped_parameters = [
321 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
322 | 'weight_decay': weight_decay},
323 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
324 | ]
325 | if local_rank != -1:
326 | self._total_train_steps = self._total_train_steps // torch.distributed.get_world_size()
327 |
328 | self._optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
329 |
330 | warmup_steps = math.ceil(self._total_train_steps * warmup_ratio) # by default 20% of train data for warm-up
331 | logger.info(f" Warmup-steps: {warmup_steps}")
332 |
333 | self._scheduler = self._get_scheduler(self._optimizer, scheduler=scheduler, warmup_steps=warmup_steps,
334 | num_total=self._total_train_steps)
335 |
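        # Optional mixed-precision training via NVIDIA apex (amp), which must be installed separately.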
336 | if fp16:
337 | try:
338 | from apex import amp
339 | except ImportError:
340 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
341 |
342 | model, optimizer = amp.initialize(self.model, self._optimizer, opt_level=fp16_opt_level)
343 | self.model = model
344 | self._optimizer = optimizer
345 |
346 | self._fp16 = fp16
347 | tb_writer = None
348 | if local_rank in [-1, 0]:
349 | tb_writer = SummaryWriter()
350 | self._tb_writer = tb_writer
351 | self._local_rank = local_rank
352 | self._best_score = -float("inf")
353 | self._early_stop_count = 0
354 | self.last_time = datetime.now()
355 | self.accumulation_steps = accumulation_steps
356 |         # assert evaluation_steps % accumulation_steps == 0, "evaluation_steps should be divisible by accumulation_steps"
357 |
358 | def _train_epoch(self, epoch: int, global_steps: int):
359 | epoch_steps = 0
360 | epoch_loss = 0.0
361 |
362 | self.model.zero_grad()
363 | for step, data in enumerate(
364 | tqdm(self._train_dataloader, desc="training", total=self._steps_per_epoch * self.accumulation_steps)):
365 |
366 | self.model.train()
367 | if data["labels"] != "skip-device":
368 | input, labels = batch_to_device(data, self._device)
369 |                 # merge labels into the inputs for training; this step is skipped at inference time
370 | if isinstance(labels, dict):
371 | for idx in range(len(input)):
372 | input[idx].update(labels)
373 | else:
374 | input = data["features"]
375 | loss_value, _ = self.model(input)
376 |
377 | if self._n_gpu > 1:
378 | loss_value = loss_value.mean()
379 | if self.accumulation_steps > 1:
380 | loss_value = loss_value / self.accumulation_steps
381 |
382 | if self._fp16:
383 | try:
384 | from apex import amp
385 | except ImportError:
386 | raise ImportError(
387 | "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
388 | with amp.scale_loss(loss_value, self._optimizer) as scaled_loss:
389 | scaled_loss.backward()
390 | torch.nn.utils.clip_grad_norm_(amp.master_params(self._optimizer), self._max_grad_norm)
391 | else:
392 | loss_value.backward()
393 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._max_grad_norm)
394 |             epoch_loss += loss_value.item()  # accumulate as a Python float so GPU tensors are not retained
395 |
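            # Gradient accumulation: only step the optimizer/scheduler and reset gradients
            # once every `accumulation_steps` mini-batches.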
396 | if (step + 1) % self.accumulation_steps == 0:
397 |
398 | self._optimizer.step()
399 | self._scheduler.step()
400 | self.model.zero_grad()
401 |
402 | epoch_steps += 1
403 | total_global = epoch_steps + global_steps
404 |
405 | if self._evaluation_steps > 0 and (total_global) % self._evaluation_steps == 0:
406 | dev_loss, eval_scores = self._dev_eval_in_training(epoch, epoch_steps)
407 | logger.info(" ***** Evaluation report *****")
408 | logger.info(f" Output path (short): {self.output_path}")
409 | logger.info(f" Early stop on: {self._dev_evaluator.early_stop_on}")
410 | logger.info(f" Early stop count = {self._early_stop_count}/{self.early_stop}")
411 | logger.info(
412 | f" Eval steps = {self._evaluation_steps} or (iterations = {self._evaluation_steps * self.accumulation_steps})")
413 | logger.info(f" Best score ({self._dev_evaluator.early_stop_on}) = {self._best_score}")
414 | logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
415 |
416 | logger.info(
417 | f" Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
418 | logger.info(
419 | f" Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._dev_data)}")
420 | now_time = datetime.now()
421 | logger.info(f" Time spent since last evaluation = {self.time_diff(self.last_time, now_time)}")
422 | self.last_time = now_time
423 |
424 | logger.info(f" Epoch = {epoch + 1}/{self._epochs}")
425 | logger.info(f" Steps = {total_global}/{self._total_train_steps}")
426 | logger.info(
427 | f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
428 |                     if dev_loss is not None:
429 | logger.info(f" dev_loss = {dev_loss:.6f}\t||\t dev_eval_scores = {eval_scores}")
430 | else:
431 | logger.info(f" dev_eval_scores = {eval_scores}")
432 |
433 | train_loss = epoch_loss / epoch_steps
434 | logger.info(f" train_loss = {train_loss}")
435 | logger.info("\n********************************************")
436 |
437 |                     if is_wandb_available() and self.wandb_config is not None:
438 |                         if dev_loss is not None:
439 | wandb.log(
440 | {"loss_dev": dev_loss,
441 | f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
442 | "loss_train": train_loss, "lr": self._scheduler.get_lr()[0]},
443 | step=total_global)
444 | else:
445 | wandb.log({"loss_train": train_loss,
446 | f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
447 | "lr": self._scheduler.get_lr()[0]},
448 | step=total_global)
449 |
450 | for key, value in eval_scores.items():
451 |                         if is_wandb_available() and self.wandb_config is not None:
452 | wandb.log({f"eval_{key}_dev": value}, step=total_global)
453 | self._tb_writer.add_scalar(f"eval_{key}_dev", value, total_global)
454 |
455 | self._tb_writer.add_scalar("lr", self._scheduler.get_lr()[0], total_global)
456 |                     if dev_loss is not None:
457 | self._tb_writer.add_scalar("loss_dev", dev_loss, total_global)
458 |
459 | self._tb_writer.add_scalar("loss_train", train_loss, total_global)
460 |
461 | if self._early_stop_count >= self.early_stop:
462 |                     logger.info(
463 |                         f" {self.early_stop} consecutive evaluation steps without improvement, stopping early...")
464 | sys.exit(0)
465 |
466 | return epoch_loss, epoch_steps
467 |
468 | def train(self):
469 | if self._restore_training:
470 | logger.info(f"***** restoring training from the previous checkpoint: {self.ck_report}*****")
471 | else:
472 | logger.info("***** Running training *****")
473 | logger.info(
474 | f" Num of training examples (actually iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
475 | logger.info(f" Output path (short): {self.output_path}")
476 | logger.info(
477 | f" Steps per Epoch = {self._steps_per_epoch} or iterations per epoch = {self._steps_per_epoch * self.accumulation_steps}")
478 | logger.info(f" Num of Epochs = {self._epochs}")
479 | logger.info(f" Best score ({self._dev_evaluator.early_stop_on}) = {self._best_score}")
480 | logger.info(
481 | f" Eval every {self._evaluation_steps} steps or every {self._evaluation_steps * self.accumulation_steps} iterations")
482 | logger.info(f" Early stop = {self.early_stop}")
483 | logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
484 |
485 | logger.info(f" Total optimization steps = {self._total_train_steps}")
486 | logger.info(
487 | f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
488 | global_loss = 0.0
489 | global_steps = 0
490 | self.last_time = datetime.now()
491 | for epoch in trange(self._epochs, desc="Epoch"):
492 | epoch_loss, epoch_steps = self._train_epoch(epoch, global_steps)
493 | global_loss += epoch_loss
494 | global_steps += epoch_steps
495 | logger.info(f"epoch {epoch + 1} ends, {self._epochs - epoch - 1} epoches left")
496 | logger.info(
497 | f"\nglobal_average_loss={global_loss / global_steps},global_steps={global_steps} on training set")
498 |
499 | if self._local_rank in [-1, 0]:
500 | self._tb_writer.close()
501 |
502 | def _dev_eval_in_training(self, epoch, steps):
503 | return_scores = {}
504 | if self._dev_evaluator is not None:
505 |
506 | return_scores = self._dev_evaluator(self.model, self.collate_fn,
507 | output_path=self.output_path, epoch=epoch, steps=steps)
508 |
509 | early_stop_on = self._dev_evaluator.early_stop_on
510 |
511 |         # lower is better for loss/perplexity, so negate them to keep a single "higher is better" comparison
512 |         check_score = -return_scores[early_stop_on] if early_stop_on in ("loss", "perplexity") else return_scores[early_stop_on]
513 | if check_score >= self._best_score and self._save_best_model:
514 | eval_scores_transformed = {key:
515 | return_scores[key].item() if torch.is_tensor(return_scores[key]) else
516 | return_scores[key]
517 | for key in return_scores.keys()}
518 | self.save(self.output_path,
519 | {"training_examples (when pos_num=1 for ranking)": len(self._train_data),
520 | "evaluation_steps": self._evaluation_steps,
521 | "train_batch_size": self._train_batch_size, "epoch": epoch + 1, "total_epochs": self._epochs,
522 | "steps": steps,
523 | "saved_at_total_steps": steps + epoch * self._steps_per_epoch,
524 | "steps_per_epoch": self._steps_per_epoch, "eval_scores_on_dev": eval_scores_transformed})
525 |
526 | self._best_score = check_score
527 |
528 | logger.info(f" Save check-point at epoch={epoch} step={steps}")
529 | self._early_stop_count = 0
530 | else:
531 | self._early_stop_count += 1
532 |
533 |         return (return_scores.pop("loss").item() if "loss" in return_scores else None), return_scores
534 |
535 | def save(self, path, eval_details):
536 | if path is None:
537 | return
538 | logger.info(f" Save model to {path}")
539 | contained_modules = []
540 |
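        # DataParallel wraps the model in `.module`; iterate over the underlying sub-modules and
        # save each one to its own numbered folder, recording the layout in modules.json.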
541 | to_iterate = self.model.module._modules if self._n_gpu > 1 else self.model._modules
542 |
543 | for idx, name in enumerate(to_iterate):
544 | module = to_iterate[str(name)]
545 |
546 | model_path = os.path.join(path, str(idx) + "_" + type(module).__name__)
547 | os.makedirs(model_path, exist_ok=True)
548 | module.save(model_path)
549 | contained_modules.append(
550 | {'idx': idx, 'name': name, 'path': os.path.basename(model_path), 'type': type(module).__module__})
551 |
552 |         if self.wandb_config is not None:
553 | with open(os.path.join(path, 'hyperparams.json'), 'w') as f:
554 | json.dump(self.wandb_config.__dict__, f, indent=2)
555 |
556 | with open(os.path.join(path, 'modules.json'), 'w') as fOut:
557 | json.dump(contained_modules, fOut, indent=2)
558 | with open(os.path.join(path, 'ck_report.json'), 'w') as fOut:
559 | json.dump(eval_details, fOut, indent=2)
560 |
561 | def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, num_total: int):
562 | assert scheduler in ["constantlr", "warmuplinear", "warmupconstant", "warmupcosine",
563 | "warmupcosinewithhardrestarts"], (
564 |             'scheduler should be one of ["constantlr","warmuplinear","warmupconstant","warmupcosine","warmupcosinewithhardrestarts"]')
565 | if scheduler == 'constantlr':
566 | return transformers.get_constant_schedule(optimizer)
567 | elif scheduler == 'warmupconstant':
568 | return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
569 | elif scheduler == 'warmuplinear':
570 | return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
571 | num_training_steps=num_total)
572 | elif scheduler == 'warmupcosine':
573 | return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
574 | num_training_steps=num_total)
575 | elif scheduler == 'warmupcosinewithhardrestarts':
576 | return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
577 | num_warmup_steps=warmup_steps,
578 | num_training_steps=num_total)
579 |
580 | def time_diff(self, t_a, t_b):
581 |         t_diff = relativedelta(t_b, t_a)  # relativedelta(later, earlier): the later/end time comes first
582 | return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)
583 |
--------------------------------------------------------------------------------