├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── data.py ├── dataset ├── README.md └── convert.py ├── demo.gif ├── evaluate.py ├── hf_model.png ├── icon.png ├── interact.py ├── model.py ├── requirements.txt ├── train.py └── trainer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dataset/source_code/ 2 | model/ 3 | venv/ 4 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ![](icon.png) AutoCoder 2 | 3 | Contributions welcome 4 | 5 | #### A basic and simple tool for code auto completion, fine-tuned from the pytorch [pre-trained GPT-2 variants](https://huggingface.co/transformers/pretrained_models.html) offered by the awesome [🤗 transformers](https://github.com/huggingface/transformers) library. 
6 | 
7 | ### Demo
8 | ![demo](demo.gif)
9 | 
10 | ### [Play on 🤗HF's Model Hub](https://huggingface.co/congcongwang/gpt2_medium_fine_tuned_coder?text=%3Cpython%3E+def+factorial)👇
11 | 
12 | ![](hf_model.png)
13 | 
14 | ### Features
15 | - Write with Python or Java.
16 | 
17 | ### Blog linked to this project
18 | - [The details of dataset construction and fine-tuning process](https://wangcongcong123.github.io/AutoCoder/)
19 | 
20 | ### Quick Start
21 | Three quick-start options are provided below.
22 | 
23 | 
24 | #### Load from 🤗transformers models
25 | There are now [two fine-tuned models](https://huggingface.co/models?search=congcongwang) uploaded to the 🤗transformers model hub. They can be used easily as long as you `pip install transformers`.
26 | 
27 | 
28 | ```python
29 | from transformers import AutoTokenizer, AutoModelWithLMHead
30 | tokenizer = AutoTokenizer.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
31 | model = AutoModelWithLMHead.from_pretrained("congcongwang/gpt2_medium_fine_tuned_coder")
32 | # or
33 | # tokenizer = AutoTokenizer.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
34 | # model = AutoModelWithLMHead.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
35 | use_cuda = True
36 | context = "def factorial"
37 | lang = "python"  # can be "java" as well.
38 | 
39 | if use_cuda:
40 |     model.to("cuda")
41 | 
42 | input_ids = tokenizer.encode("<python> " + context,  # "<python>" / "<java>" are the language control tokens
43 |                              return_tensors='pt') if lang == "python" else tokenizer.encode(
44 |     "<java> " + context, return_tensors='pt')
45 | outputs = model.generate(input_ids=input_ids.to("cuda") if use_cuda else input_ids,
46 |                          max_length=128,
47 |                          temperature=0.7,
48 |                          num_return_sequences=1)
49 | 
50 | decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
51 | print(decoded)
52 | ```
53 | 
54 | 
55 | 
56 | #### Ready-to-go Interaction
57 | ```
58 | git clone https://github.com/wangcongcong123/auto_coding.git
59 | pip install -r requirements.txt
60 | ```
61 | 
62 | 1. Download a fine-tuned model; two versions are provided:
63 |     * [distilgpt2_fine_tuned_coder (params: 82M, size: 291MB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/distilgpt2_fine_tuned_coder.zip)
64 |     * [gpt2_medium_fine_tuned_coder (params: 345M, size: 1.22GB)](https://ucdcs-student.ucd.ie/~cwang/autocoder/gpt2_medium_fine_tuned_coder.zip)
65 | 2. Unzip the model and move it to `./model` (create the directory first).
66 | 3. Run the interactive script: `python interact.py`
67 | 
68 | #### Fine-tuning yours
69 | ```
70 | git clone https://github.com/wangcongcong123/auto_coding.git
71 | pip install -r requirements.txt
72 | ```
73 | 
74 | 1. Prepare [the dataset](./dataset)
75 | 2. Start fine-tuning the model: `python train.py --model_select distilgpt2`
76 | 3. After fine-tuning, the model will be saved to `./model/distilgpt2_fine_tuned_coder/0_GPTSingleHead`, which is exactly the fine-tuned version provided in Ready-to-go Interaction.
77 | 
78 | \* For more training parameter options, run `python train.py -h`
79 | 
80 | ### Generative examples
81 | Good Python generation examples by the fine-tuned GPT2-medium
82 | ```python
83 | -------------Example 1--------------------------------------
84 | Context code: def factorial
85 | Generated:
86 | def factorial(n):
87 |     return 1 if n == 1 else n * factorial(n - 1)
88 | 
89 | --------------Example 2-------------------------------------
90 | Context code: def sum_of_series(num_of_terms:int, power: int)
91 | Generated:
92 | def sum_of_series(num_of_terms:int, power: int) -> float:
93 |     """Calculate Sum of Series.
94 |     >>> sum_of_series(1, 1, 10)
95 |     55.0
96 |     >>> sum_of_series(1, 10, 100)
97 |     49600.0
98 |     """
99 |     sum = (num_of_terms / 2) * power
100 |     return sum
101 | 
102 | --------------Example 3-------------------------------------
103 | Context code: def quick_sort(collection)
104 | Generated:
105 | def quick_sort(collection):
106 |     """
107 |     >>> quick_sort([0, 5, 3, 2, 2])
108 |     [0, 2, 2, 3, 5]
109 |     >>> quick_sort([])
110 |     []
111 |     >>> quick_sort([-2, -5, -45])
112 |     [-45, -5, -2]
113 |     """
114 |     length = len(collection)
115 |     if length <= 1:
116 |         return collection
117 |     else:
118 |         # Use the last element as the first pivot
119 |         pivot = collection.pop()
120 |         # Put elements greater than pivot in greater list
121 |         # Put elements lesser than pivot in lesser list
122 |         greater, lesser = [], []
123 |         for element in collection:
124 |             if element > pivot:
125 |                 ...
126 | ```
127 | Good Java generation examples by the fine-tuned GPT2-medium
128 | 
129 | ```java
130 | --------------Example 1-------------------------------------
131 | Context code: Scanner input= new Scanner(System.in)
132 | Generated:
133 | Scanner input= new Scanner(System.in)
134 | System.out.println("Enter the Octal number: ");
135 | // Take octal number as input from user in a string
136 | String oct = input.next();
137 | 
138 | --------------Example 2-------------------------------------
139 | Context code: private static int CountCharacters(String str)
140 | Generated:
141 | private static int CountCharacters(String str) {
142 |     return str.replaceAll("\\s", "").length();
143 | }
144 | ```
145 | \* Although some of the generated examples look good, they should be taken with a grain of salt when judging the model's actual performance. The model may simply **"remember"** existing code from the training set.
146 | 
147 | ### TODO list
148 | - Expand the dataset (and construct it more carefully) and increase the context window. Try larger generative models such as GPT-2 large or even [GPT-3 variants](https://arxiv.org/abs/2005.14165), as proposed recently, if computational resources allow.
149 | - Remove overlap between training examples and dev examples for contamination studies, i.e., to study to what extent the model memorizes examples rigidly or [relies on surface-level heuristics learned during training](https://arxiv.org/pdf/1902.01007.pdf).
150 | - Try some adversarial examples (more complicated ones, for probing the model's reasoning capability) to test the robustness of the model.
151 | - Integrate this into a real-life use case such as a code editor, e.g. [Sublime Text](https://www.sublimetext.com/), where a threshold on the joint probability of a suggestion may need to be studied for code snippet recommendations (see the sketch at the end of this README).
152 | - Try some ideas for location-aware code generation. For example, if a human coder is writing a comment, the autocoder should be aware of the coder's context (left and right, if available) to help complete the corresponding content.
153 | - Model size and inference efficiency are a problem in real-life use cases.
154 | - Do research in this problem domain to get a general idea of what work has been done in the literature on this particular problem.
155 | 
156 | 
157 | 
158 | ### Extra notes
159 | * Multi-GPU training only works with torch==1.4.0; it does not work with torch==1.5.0. No idea so far how to fix this issue.
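
### Sketch: joint-probability threshold for recommendations

The TODO item about editor integration mentions studying a threshold on the joint probability of a generated snippet. The code below is only a minimal sketch of how such a score could be computed with the fine-tuned models above; it is not part of this codebase, the `THRESHOLD` value is a made-up placeholder, and averaging per-token log-probabilities (rather than using the raw joint probability) is just one reasonable normalization choice.

```python
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer

THRESHOLD = -2.5  # hypothetical cut-off on the average per-token log-probability; would need tuning


def completion_score(model, tokenizer, context: str, completion: str, device: str = "cpu") -> float:
    """Average log-probability per token of `completion` given `context` under the language model."""
    context_ids = tokenizer.encode(context)
    full_ids = tokenizer.encode(context + completion)
    input_ids = torch.tensor([full_ids], device=device)
    with torch.no_grad():
        logits = model(input_ids)[0]  # (1, seq_len, vocab_size)
    # log p(token_i | tokens_<i) for every position after the first token
    log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
    targets = input_ids[:, 1:].unsqueeze(-1)
    token_log_probs = log_probs.gather(-1, targets).squeeze(-1)  # shape (1, seq_len - 1)
    # keep only the tokens that belong to the completion, not the context
    return token_log_probs[0, len(context_ids) - 1:].mean().item()


if __name__ == "__main__":
    tokenizer = GPT2Tokenizer.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder")
    model = GPT2LMHeadModel.from_pretrained("congcongwang/distilgpt2_fine_tuned_coder").eval()
    completion = "(n):\n    return 1 if n == 1 else n * factorial(n - 1)"
    score = completion_score(model, tokenizer, "<python> def factorial", completion)
    print("recommend" if score >= THRESHOLD else "discard", round(score, 3))
```

An editor plugin would then surface a candidate completion only when its score clears the threshold.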
160 | 
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 | import os, pickle, json
3 | import logging
4 | 
5 | logger = logging.getLogger(__name__)
6 | from tqdm import tqdm
7 | 
8 | class SrcCodeDataset(Dataset):
9 |     def __init__(self, file_path, model, cache_path=None):
10 |         """
11 |         this dataset class is used to load the source code dataset in batches for fine-tuning with GPT2LMHeadModel
12 |         :param model: the model that the dataset will be fed to
13 |         """
14 |         self.inputs = []
15 |         load_cache = False
16 |         if cache_path != None:
17 |             load_cache = self._load_cache(cache_path)
18 |         if not load_cache:
19 |             self._build(file_path, model)
20 |             if cache_path != None:
21 |                 self._cache(cache_path)
22 | 
23 |     def __len__(self):
24 |         return len(self.inputs)
25 | 
26 |     def __getitem__(self, index):
27 |         input_ids = self.inputs[index]["input_ids"]
28 |         # input_mask = self.inputs[index]["attention_mask"]  we don't need attention_mask for this task
29 |         # return {"input_ids": input_ids, "input_mask": input_mask}
30 |         return {"input_ids": input_ids}
31 | 
32 |     def _load_cache(self, cache_path):
33 |         load_cache = False
34 |         if os.path.isdir(cache_path):
35 |             if os.path.isfile(os.path.join(cache_path, "inputs.pk")):
36 |                 with open(os.path.join(cache_path, "inputs.pk"), "rb") as f:
37 |                     logger.info(
38 |                         f" load cached token ids from {cache_path}")
39 |                     self.inputs = pickle.load(f)
40 |                     load_cache = True
41 |         return load_cache
42 | 
43 |     def _cache(self, cache_path):
44 |         if not os.path.isdir(cache_path):
45 |             os.makedirs(cache_path)
46 |         with open(os.path.join(cache_path, "inputs.pk"), "wb") as f:
47 |             pickle.dump(self.inputs, f)
48 |             logger.info(
49 |                 f" save tokenized ids of samples to: {cache_path}/inputs.pk")
50 | 
51 |     def _build(self, file_path, model):
52 |         with open(file_path) as f:
53 |             for line in tqdm(f):
54 |                 example = json.loads(line.strip())
55 |                 if example["label"].lower() == "python":
56 |                     encoded_plus = model.tokenizer.encode_plus(
57 |                         model.tokenize("<python>") + example["token_ids"] + [model.eos_token_id],
58 |                         max_length=model.max_seq_length)
59 |                 elif example["label"].lower() == "java":
60 |                     encoded_plus = model.tokenizer.encode_plus(
61 |                         model.tokenize("<java>") + example["token_ids"] + [model.eos_token_id],
62 |                         max_length=model.max_seq_length)
63 |                 self.inputs.append(encoded_plus.data)
64 | 
--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | ### Download directly from [here](https://ucdcs-student.ucd.ie/~cwang/autocoder/source_code.zip)
2 | 
3 | Unzip the source_code archive and move it under this directory.
4 | 
5 | 
6 | ### Or build dataset from scratch
7 | This allows you to customize how the dataset is built. Below is an example of the building process.
8 | 
9 | Let's use Python and Java code from [The Algorithms project](https://github.com/TheAlgorithms) as the dataset. We want AutoCoder to help auto-complete code at a general level, and the code of The Algorithms suits that need. Another reason is that the code in this project is well written (high-quality code!).
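
For reference, each line of the `train.jsonl` / `dev.jsonl` files produced by the steps below is a JSON record containing a segment of GPT-2 token ids plus its language label (this is what `convert.py` writes and what `data.py` consumes). The token ids in this record are made up purely for illustration:

```
{"token_ids": [4299, 2163, 7, 77, 2599, 198, 220], "label": "Python"}
```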
10 | 11 | ##### download source code 12 | ``` 13 | git clone https://github.com/TheAlgorithms/Python 14 | git clone https://github.com/TheAlgorithms/Java 15 | ``` 16 | 17 | ##### Move the dowloaded two folders into here this `dataset/` directory and then run 18 | 19 | ``` 20 | python convert.py --segment_len 256 --stride 10 --dev_size 0.1 21 | ``` 22 | 23 | You will find a train set named train.jsonl and dev set named dev.jsonl under `source_code/json/`. 24 | 25 | Have a look at the `convert.py` script for the specific process of dataset construction or quickly read [this blog](#). 26 | 27 | -------------------------------------------------------------------------------- /dataset/convert.py: -------------------------------------------------------------------------------- 1 | import glob, json, os, argparse 2 | from tqdm import tqdm 3 | from sklearn.model_selection import train_test_split 4 | from transformers import GPT2Tokenizer 5 | 6 | if __name__ == '__main__': 7 | 8 | parser = argparse.ArgumentParser(description='Params') 9 | parser.add_argument('--segment_len', type=int, default=254, 10 | help='the length of each example') 11 | # we set this to be 254 instead of 256 because we want the input to be like: input_ids 12 | parser.add_argument('--stride', type=int, default=10, 13 | help='stride to split training examples') 14 | parser.add_argument('--dev_size', type=float, default=0.1, 15 | help='split ratio of development set for each language') 16 | args = parser.parse_args() 17 | 18 | gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False) 19 | paths = ['Python', 'Java'] 20 | segments = {} 21 | 22 | for path in paths: 23 | source_files = glob.glob(f'{path}/**/*.py' if path == "Python" else f'{path}/**/*.java', recursive=True) 24 | for each_src in tqdm(source_files): 25 | with open(each_src, "r", encoding="utf-8") as f: 26 | code_content = f.read() 27 | encoded = gpt2_tok.encode(code_content) 28 | for i in range(len(encoded) // args.stride): 29 | seg = encoded[i * args.stride:i * args.stride + args.segment_len] 30 | if path not in segments: 31 | segments[path] = [] 32 | segments[path].append(json.dumps({"token_ids": seg, "label": path})) 33 | 34 | train, dev = [], [] 35 | for key in segments: 36 | # we don't shuffle before splitting because we want the train and dev to be very different (less overlapping) 37 | tr, de = train_test_split(segments[key], test_size=args.dev_size) 38 | train += tr 39 | dev += de 40 | 41 | to_path = "source_code/json" 42 | if not os.path.isdir(to_path): 43 | os.makedirs(to_path) 44 | 45 | with open(os.path.join(to_path, "train.jsonl"), "w") as f: 46 | f.write("\n".join(train)) 47 | 48 | with open(os.path.join(to_path, "dev.jsonl"), "w") as f: 49 | f.write("\n".join(dev)) 50 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/demo.gif -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | import torch 3 | import logging 4 | from torch.utils.data import DataLoader 5 | from tqdm import tqdm 6 | 7 | logging.basicConfig( 8 | format=logging.BASIC_FORMAT, 9 | datefmt='%Y-%m-%d %H:%M:%S', 10 | level=logging.INFO 11 | ) 12 | logger = logging.getLogger(__name__) 13 | 14 | class 
SingleCLMEvaluator(): 15 | def __init__(self, dataloader: DataLoader = None, 16 | data_tag: str = "dev", 17 | device: int = None, tokenizer=None, early_stop_on: str = "perplexity"): 18 | 19 | if data_tag not in ["dev", "train", "test"]: 20 | raise ValueError("data_tag has to be one of dev, train or test") 21 | assert early_stop_on in ["loss", "perplexity"] 22 | self.early_stop_on = early_stop_on 23 | self.dataloader = dataloader 24 | self.data_tag = data_tag 25 | self.tokenizer = tokenizer 26 | 27 | self.n_gpu = torch.cuda.device_count() 28 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | if device == -1: 30 | self.n_gpu = 0 31 | self.device = torch.device("cpu") 32 | 33 | def reset_dataloader(self, dataloader: DataLoader): 34 | self.dataloader = dataloader 35 | 36 | def reset_logger(self, output_path): 37 | pass 38 | 39 | def __call__(self, model, collate_fn, output_path: str = None, epoch: int = -1, steps: int = -1, 40 | target_names: List[str] = None, do_predict: bool = False) -> Dict[ 41 | str, float]: 42 | 43 | if do_predict and self.tokenizer == None: 44 | raise ValueError("you are doing predict so need a tokenizer") 45 | if self.dataloader is None: 46 | raise ValueError(" need to set dataloader for this evaluator, call reset_dataloader()") 47 | 48 | model.eval() 49 | if epoch == -1 and steps == -1: 50 | logger.info( 51 | f"\nEvaluation the model on {self.data_tag} dataset") 52 | else: 53 | logger.info( 54 | "\nEvaluation the model on " + self.data_tag + " dataset" + f" in epoch {epoch} after {steps} steps:") 55 | 56 | self.dataloader.collate_fn = collate_fn 57 | total_loss = 0.0 58 | total_steps = 0 59 | 60 | for step, batch in enumerate(tqdm(self.dataloader, desc="evaluating")): 61 | input = batch["features"] 62 | # batch to device 63 | for feature_name, ids in input.items(): 64 | input[feature_name] = ids.to(self.device) 65 | 66 | with torch.no_grad(): 67 | loss, logits = model(input) 68 | loss = loss.mean() 69 | total_loss += loss 70 | 71 | total_steps += 1 72 | eval_loss = total_loss / total_steps 73 | eval_results = {"loss": eval_loss} 74 | 75 | perplexity = torch.exp(torch.tensor(eval_loss)).clone().detach() 76 | eval_results["perplexity"] = perplexity.mean().item() 77 | return eval_results 78 | -------------------------------------------------------------------------------- /hf_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/hf_model.png -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangcongcong123/auto_coding/e6a96df85dc993c1e4de0d248743d098f4f62530/icon.png -------------------------------------------------------------------------------- /interact.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | import argparse 3 | 4 | if __name__ == '__main__': 5 | 6 | parser = argparse.ArgumentParser(description='Params') 7 | parser.add_argument('--model_path', type=str, default="model/gpt2_medium_fine_tuned_coder", 8 | help='the path to load fine-tuned model') 9 | parser.add_argument('--max_length', type=int, default=128, 10 | help='maximum length for code generation') 11 | parser.add_argument('--temperature', type=float, default=0.7, 12 | 
help='temperature for sampling-based code geneeration') 13 | parser.add_argument( 14 | "--use_cuda", action="store_true", help="inference with gpu?" 15 | ) 16 | 17 | args = parser.parse_args() 18 | 19 | # load fine-tunned model and tokenizer from path 20 | model = GPT2LMHeadModel.from_pretrained(args.model_path) 21 | tokenizer = GPT2Tokenizer.from_pretrained(args.model_path) 22 | 23 | model.eval() 24 | if args.use_cuda: 25 | model.to("cuda") 26 | 27 | # now the fine-tunned model supports two programming languages, namely, python and java 28 | def lang_select(): 29 | lang = "" 30 | while lang not in ["python", "java"]: 31 | print('Enter the programming language you prefer (python or java)') 32 | lang = input(">>> ").lower() 33 | return lang 34 | 35 | 36 | lang = lang_select() 37 | 38 | context = "" 39 | while context != "exit": 40 | print(f'You are using {lang} now. Enter the context code (exit or change_lang)') 41 | context = input(">>> ") 42 | 43 | if context == "change_lang": 44 | lang = lang_select() 45 | 46 | print(f"You are using {lang} now. Enter the context code") 47 | context = input(">>> ") 48 | 49 | input_ids = tokenizer.encode(" " + context, 50 | return_tensors='pt') if lang == "python" else tokenizer.encode( 51 | " " + context, return_tensors='pt') 52 | outputs = model.generate(input_ids=input_ids.to("cuda") if args.use_cuda else input_ids, 53 | max_length=args.max_length, 54 | temperature=args.temperature, 55 | num_return_sequences=1) 56 | for i in range(1): 57 | decoded = tokenizer.decode(outputs[i], skip_special_tokens=True) 58 | # ends with occurence of double new lines (to meet the convention of code completion) 59 | if "\n\n" in decoded: 60 | decoded = decoded[:decoded.index("\n\n")] 61 | 62 | print('Generated {}: {}'.format(i, decoded)) 63 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | import json 3 | from typing import Dict 4 | import os 5 | import torch 6 | from torch import nn, Tensor 7 | 8 | import logging 9 | logging.basicConfig( 10 | format=logging.BASIC_FORMAT, 11 | datefmt='%Y-%m-%d %H:%M:%S', 12 | level=logging.INFO 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | class GPTSingleHead(nn.Module): 17 | """ 18 | Different from directly using GPT2LMHeadModel, this wraps up GPT2LMHeadModel as well as GPT2Tokenizer 19 | """ 20 | def __init__(self, model_name_or_path: str, max_seq_length: int = 256, do_lower_case: bool = False, 21 | special_words_to_add=None): 22 | super(GPTSingleHead, self).__init__() 23 | self.config_keys = ['max_seq_length', 'do_lower_case'] 24 | self.do_lower_case = do_lower_case 25 | if max_seq_length > 1024: 26 | logging.warning( 27 | "GPT only allows a max_seq_length of 1024. 
Value will be set to 1024") 28 | max_seq_length = 1024 29 | self.max_seq_length = max_seq_length 30 | self.gpt = GPT2LMHeadModel.from_pretrained(model_name_or_path) 31 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 32 | if special_words_to_add != None: 33 | self.add_special_words(special_words_to_add) 34 | 35 | self.bos_token_id=self.tokenizer.bos_token_id 36 | self.eos_token_id=self.tokenizer.eos_token_id 37 | # self.pad_token_id=self.tokenizer.pad_token_id 38 | 39 | def tokenize(self, text: str): # default for cls 40 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 41 | 42 | def add_special_words(self, special_words_to_add): 43 | orig_num_tokens = len(self.tokenizer) 44 | num_added_tokens = self.tokenizer.add_special_tokens(special_words_to_add) 45 | if num_added_tokens > 0: 46 | self.gpt.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) 47 | 48 | def forward(self, input: Dict[str, torch.Tensor]): 49 | loss, logits=self.gpt(input["input_ids"],labels=input["input_ids"])[:2] 50 | return loss, logits 51 | 52 | def get_config_dict(self): 53 | return {key: self.__dict__[key] for key in self.config_keys} 54 | 55 | def padding_features(self, features_dict_list): 56 | """ 57 | padding features for a batch 58 | :param features_dict_list: i.e., batch 59 | :return: padded batch features 60 | """ 61 | max_input_len_this_batch = 0 62 | 63 | batch_features = {feature_name: [] for feature_name in features_dict_list[0]} 64 | for feature_dict in features_dict_list: 65 | for feature_name, feature_ids in feature_dict.items(): 66 | if feature_name == "input_ids" and len(feature_ids) > max_input_len_this_batch: 67 | max_input_len_this_batch = len(feature_ids) 68 | batch_features[feature_name].append(feature_ids) 69 | 70 | padded_batch_features = {feature_name: [] for feature_name in features_dict_list[0]} 71 | for feature_name, batch_ids in batch_features.items(): 72 | 73 | for each_ids in batch_ids: 74 | padded = each_ids + [self.tokenizer.pad_token_id] * (max_input_len_this_batch - len(each_ids)) 75 | padded_batch_features[feature_name].append(padded) 76 | 77 | for feature_name, ids in padded_batch_features.items(): 78 | padded_batch_features[feature_name] = torch.tensor(ids) 79 | 80 | return padded_batch_features 81 | 82 | def get_embedding_dimension(self) -> int: 83 | return self.gpt.config.hidden_size 84 | 85 | def get_config(self) -> int: 86 | return self.gpt.config 87 | 88 | def save(self, output_path: str): 89 | self.gpt.save_pretrained(output_path) 90 | self.tokenizer.save_pretrained(output_path) 91 | with open(os.path.join(output_path, 'gpt_sh_config.json'), 'w') as f: 92 | json.dump(self.get_config_dict(), f, indent=2) 93 | 94 | def reload(self, input_path: str): 95 | """reload from checkpoint weights""" 96 | return GPTSingleHead.load(input_path + "/0_GPTSingleHead") 97 | 98 | @staticmethod 99 | def load(input_path: str): 100 | if not os.path.isfile(os.path.join(input_path, 'gpt_sh_config.json')): 101 | raise ValueError("In the model path does not find gpt_sh_config.json file, you may have not trained yet") 102 | with open(os.path.join(input_path, 'gpt_sh_config.json')) as f: 103 | config = json.load(f) 104 | return GPTSingleHead(model_name_or_path=input_path, **config) 105 | 106 | 107 | class EmptyHeads(nn.Module): 108 | def __init__(self): 109 | self.config_keys=[] 110 | super().__init__() 111 | 112 | def forward(self, input: Dict[str, Tensor]): 113 | return input 114 | 115 | def 
get_config_dict(self): 116 | return {key: self.__dict__[key] for key in self.config_keys} 117 | 118 | def save(self, output_path): 119 | with open(os.path.join(output_path, 'empty_heads_config.json'), 'w') as f: 120 | json.dump(self.get_config_dict(), f, indent=2) 121 | torch.save(self.state_dict(), os.path.join(output_path, 'empty_heads.pt')) 122 | 123 | def load_saved(self, input_path): 124 | self.load_state_dict(torch.load(os.path.join(input_path, '1_EmptyHeads', 'empty_heads.pt'))) 125 | 126 | @staticmethod 127 | def load(input_path,config): 128 | if not os.path.isfile(os.path.join(input_path, 'empty_heads_config.json')): 129 | raise ValueError( 130 | "In the model path does not find empty_heads_config.json file, you may have not trained yet") 131 | 132 | with open(os.path.join(input_path, 'empty_heads_config.json')) as f: 133 | config = json.load(f) 134 | model = EmptyHeads() 135 | 136 | if not os.path.isfile(os.path.join(input_path, 'empty_heads.pt')): 137 | raise ValueError("In the model path does not find state of file, you need to train and get weights first") 138 | 139 | model.load_state_dict(torch.load(os.path.join(input_path, 'empty_heads.pt'))) 140 | return model 141 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | transformers 4 | torch==1.4.0 5 | numpy 6 | wandb -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import logging 3 | 4 | logging.basicConfig( 5 | format=logging.BASIC_FORMAT, 6 | datefmt='%Y-%m-%d %H:%M:%S', 7 | level=logging.INFO 8 | ) 9 | logger = logging.getLogger(__name__) 10 | 11 | MODEL_MAP = {"distilgpt2": "distilgpt2", "gpt2": "gpt2", "gpt2_medium": "gpt2-medium", 12 | "gpt2_large": "gpt2-large"} 13 | 14 | from model import GPTSingleHead 15 | from trainer import ModelTrainer 16 | from data import SrcCodeDataset 17 | from evaluate import SingleCLMEvaluator 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser(description='Hyper params') 21 | parser.add_argument('--model_select', type=str, default="distilgpt2", 22 | help='model select from distilgpt2, gpt2_medium, gpt2, or gpt2_large') 23 | parser.add_argument('--dataset_name', type=str, default="source_code", 24 | help='dataset name whatever name you put into the ./dataset directory (by default: source_code)') 25 | parser.add_argument('--per_gpu_train_batch_size', type=int, default=4, 26 | help='input batch size for training') 27 | parser.add_argument('--dev_batch_size', type=int, default=8, 28 | help='input batch size for development') 29 | parser.add_argument('--num_epochs_train', type=int, default=16, 30 | help='number of epochs to train') 31 | parser.add_argument('--max_seq_length', type=int, default=256, 32 | help='maximum sequence length of samples in a batch for training') 33 | parser.add_argument('--lr', type=float, default=2e-5, 34 | help='learning rate') 35 | parser.add_argument('--warmup_ratio', type=float, default=0.2, 36 | help='warmup_ratio') 37 | parser.add_argument('--early_stop', type=int, default=20, 38 | help='early_stop') 39 | parser.add_argument('--scheduler', type=str, default="warmuplinear", 40 | help='scheduler') 41 | parser.add_argument('--seed', type=int, default=122, 42 | help='random seed') 43 | parser.add_argument('--accumulation_steps', type=int, 
default=1, 44 | help='accumulation steps if you want large batch size but can not fit in the memory allowed') 45 | parser.add_argument('--n_gpu', type=int, default=1, 46 | help='number of gpu for training') 47 | parser.add_argument('--visiable_device', type=str, default="0", 48 | help='visiable gpus for training, should be consistent with n_gpu') 49 | parser.add_argument('--evaluation_steps', type=int, default=200, 50 | help='evaluation_steps') 51 | parser.add_argument('--wandb_project_name', type=str, default="code_generate", 52 | help='project name for wandb') 53 | parser.add_argument( 54 | "--restore_training", action="store_true", help="restore training if a saved checkopint exists" 55 | ) 56 | parser.add_argument( 57 | "--with_wandb", action="store_true", help="Train with wandb tracking." 58 | ) 59 | 60 | args = parser.parse_args() 61 | logger.info(f"args: {args}") 62 | dataset_folder = f"dataset/{args.dataset_name}/json/" 63 | assert args.model_select in MODEL_MAP.keys(), (f"model has to be in {MODEL_MAP.keys()}") 64 | output_path = f"model/{args.model_select}_fine_tuned_coder" 65 | logger.info("{} for dataset in: {}".format(output_path, dataset_folder)) 66 | logger.info( 67 | f"*****************model select: {args.model_select} for code generation using dataset: {args.dataset_name}******************") 68 | # add more params for wandb 69 | args.wandb_run_name = output_path 70 | #initialize model by model name (the same as used in transformers lib) 71 | model = GPTSingleHead(MODEL_MAP[args.model_select], max_seq_length=args.max_seq_length) 72 | #add special tokens for controlling code generation by different programming language 73 | model.add_special_words({"pad_token": "", "additional_special_tokens": ["", ""]}) 74 | #load training dataset 75 | file_path = dataset_folder + "train.jsonl" 76 | train_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "train")) 77 | #load developlemt dataset 78 | file_path = dataset_folder + "dev.jsonl" 79 | dev_dataset = SrcCodeDataset(file_path, model, cache_path=os.path.join(".cache", output_path, "dev")) 80 | # initialize development evaluator 81 | dev_evaluator = SingleCLMEvaluator() 82 | # initialize model trainer 83 | model_trainer = ModelTrainer(model, 84 | train_dataset=train_dataset, 85 | dev_dataset=dev_dataset, 86 | dev_evaluator=dev_evaluator, 87 | scheduler=args.scheduler, 88 | epochs=args.num_epochs_train, 89 | per_gpu_train_batch_size=args.per_gpu_train_batch_size, 90 | output_path=output_path, 91 | optimizer_params={'lr': args.lr, 'eps': 1e-6, 'correct_bias': False}, 92 | evaluation_steps=args.evaluation_steps, 93 | early_stop=args.early_stop, 94 | dev_batch_size=args.dev_batch_size, 95 | restore_training=args.restore_training, 96 | accumulation_steps=args.accumulation_steps, 97 | n_gpu=args.n_gpu, 98 | visiable_device=args.visiable_device, 99 | warmup_ratio=args.warmup_ratio, 100 | seed=args.seed, 101 | data_loader_shuffle=True, 102 | wandb_config=args if args.with_wandb else None) 103 | #start training 104 | model_trainer.train() 105 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import shutil 5 | import sys 6 | from typing import Type, Dict 7 | 8 | import torch 9 | import transformers 10 | 11 | try: 12 | from torch.utils.tensorboard import SummaryWriter 13 | except ImportError: 14 | from tensorboardX import 
SummaryWriter 15 | from torch import nn 16 | from torch.utils.data import DataLoader 17 | from torch.utils import data 18 | from torch.optim.optimizer import Optimizer 19 | from tqdm import trange, tqdm 20 | 21 | from dateutil.relativedelta import relativedelta 22 | 23 | import random 24 | import numpy as np 25 | import logging 26 | from model import EmptyHeads 27 | 28 | logging.basicConfig( 29 | format=logging.BASIC_FORMAT, 30 | datefmt='%Y-%m-%d %H:%M:%S', 31 | level=logging.INFO 32 | ) 33 | logger = logging.getLogger(__name__) 34 | 35 | from datetime import datetime 36 | 37 | try: 38 | import wandb 39 | 40 | wandb.ensure_configured() 41 | if wandb.api.api_key is None: 42 | _has_wandb = False 43 | wandb.termwarn("W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.") 44 | else: 45 | _has_wandb = False if os.getenv("WANDB_DISABLED") else True 46 | except ImportError: 47 | _has_wandb = False 48 | 49 | 50 | def set_seed(seed, n_gpu): 51 | logger.info(f" see seed for random, numpy and torch {seed}") 52 | random.seed(seed) 53 | np.random.seed(seed) 54 | torch.manual_seed(seed) 55 | if n_gpu > 0: 56 | torch.cuda.manual_seed_all(seed) 57 | 58 | 59 | def print_model_state_dict(model): 60 | for param_tensor in model.state_dict(): 61 | logger.info(f"{param_tensor}\t{model.state_dict()[param_tensor].size()}") 62 | 63 | 64 | def print_optimizer_state_dict(optimizer): 65 | for var_name in optimizer.state_dict(): 66 | logger.info(f"{var_name}\t{optimizer.state_dict()[var_name]}") 67 | 68 | 69 | def count_params(model: torch.nn.Module, print_details: bool = False): 70 | trainable_count = 0 71 | total_count = 0 72 | if isinstance(model, torch.nn.Sequential): 73 | for index in model._modules: 74 | if print_details: 75 | print_model_state_dict(model._modules[index]) 76 | logger.info(model._modules[index]) 77 | trainable_count += sum(p.numel() for p in model._modules[index].parameters() if p.requires_grad) 78 | total_count += sum(p.numel() for p in model._modules[index].parameters()) 79 | else: 80 | if print_details: 81 | print_model_state_dict(model) 82 | logger.info(model) 83 | total_count = sum(p.numel() for p in model.parameters()) 84 | trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad) 85 | logger.info(f' Total params: {total_count}') 86 | logger.info(f' Trainable params: {trainable_count}') 87 | logger.info(f' Non-trainable params: {total_count - trainable_count}') 88 | 89 | 90 | def batch_to_device(batch, device, keep_label=False): 91 | features = batch['features'] 92 | if isinstance(features, dict): 93 | for feature_name in features: 94 | features[feature_name] = features[feature_name].to(device) 95 | else: 96 | for inx in range(len(features)): 97 | for feature_name in features[inx]: 98 | features[inx][feature_name] = features[inx][feature_name].to(device) 99 | 100 | label_space = batch['labels'] 101 | if label_space == None: # for tasks like lm, labels are none. 
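        # Note: for the causal LM task the collate function intentionally sets batch['labels'] to None,
        # since GPTSingleHead.forward() reuses input_ids as the labels when computing the loss.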
102 | return features, None 103 | if not keep_label: 104 | labels = {"label_space_" + str(inx): label_space[inx].to(device) if torch.is_tensor(label_space[inx]) else 105 | label_space[inx] for inx in range(len(label_space))} 106 | else: 107 | labels = label_space 108 | return features, labels 109 | 110 | 111 | def is_wandb_available(): 112 | return _has_wandb 113 | 114 | 115 | class CollateFunction(): 116 | def __init__(self, up_model): 117 | self.up_model = up_model 118 | 119 | def __call__(self, batch): 120 | if isinstance(batch[0], dict): 121 | padded_features = self.up_model.padding_features(batch) 122 | return {'features': padded_features, 123 | "labels": None} # label_ids are in features, this task does not need labels, we set 124 | 125 | 126 | class ModelTrainer(): 127 | def __init__(self, up_model: nn.Module, down_layer: nn.Module = None, train_dataset=None, 128 | dev_dataset=None, dev_evaluator=None, 129 | epochs: int = 1, 130 | visiable_device: str = "0", 131 | scheduler: str = 'warmuplinear', 132 | warmup_ratio: float = 0.1, 133 | optimizer_class: Type[Optimizer] = transformers.AdamW, 134 | optimizer_params: Dict[str, object] = {'lr': 5e-5, 'eps': 1e-6, 'correct_bias': False}, 135 | weight_decay: float = 0.01, 136 | early_stop: int = 20, 137 | # 20 evaluation steps without improving on the early_stop_on metric as specified in dev_evaluator 138 | evaluation_steps: int = 500, 139 | output_path: str = None, 140 | save_best_model: bool = True, 141 | max_grad_norm: float = 1, 142 | fp16: bool = False, 143 | accumulation_steps=1, 144 | fp16_opt_level: str = 'O1', 145 | seed: int = 122, 146 | data_loader_shuffle=True, 147 | device: str = None, 148 | dev_batch_size: int = -1, # the same as train_batch_size 149 | n_gpu: int = None, 150 | report_model: bool = True, 151 | per_gpu_train_batch_size: int = 8, 152 | restore_training: bool = False, 153 | local_rank: int = -1, 154 | wandb_config=None): 155 | """ 156 | this trainer is written for training a sequential model that contains an upstream_layer (usually transformers) 157 | and a downstream_layer (usually task-specific heads like FF, RNN, CNN for encoding the output of upstram_layer) 158 | 159 | :param up_model: transformers like transformers.GPT2LMHeadModel or transformers.BERTModel 160 | :param down_layer: None if up_model already wraps up with an output encoder such as LMHead in GPT2LMHeadModel, else nn.Module for encoding the output of up_model 161 | :param train_dataset: train_dataset, it can be either instance of torch.data.Dataset or IterableDataset (defined in data.py) 162 | :param dev_dataset: dev_dataset, it can be either instance of torch.data.Dataset or IterableDataset 163 | :param dev_evaluator: dev_evaluator, evaluator on dev_dataset for early stop and performance tracking during training (defined in evaluate.py) 164 | :param epochs: number of epoches for training 165 | :param visiable_device: devices chosen to perform training 166 | :param scheduler: scheduler specially from transformers: see options in self._get_scheduler 167 | :param warmup_ratio: warmup_ratio ratio for learning rate over total training steps 168 | :param optimizer_class: transformers.AdamW de byfault 169 | :param optimizer_params: optimizer params 170 | :param weight_decay:weight decay 171 | :param early_stop:early stop steps 172 | :param evaluation_steps:logging steps 173 | :param output_path: path to save the checkpoint with the best performance as specified in early_stop_on in dev_evaluator instance 174 | :param save_best_model:save best checkpoint 
or the latest checkpoint 175 | :param max_grad_norm:max grad norm 176 | :param fp16: fp16 training 177 | :param accumulation_steps:accumulation steps 178 | :param fp16_opt_level:fp16 opt level 179 | :param seed:random seed for reproducibility 180 | :param data_loader_shuffle:Whether to shuffle data_loader of training dataset and dev dataset after epoch ends 181 | :param device: device for training, None or gpu for gpu training, cpu for gpu training 182 | :param dev_batch_size: development batch size, usually larger than training batch size due to no grads calculation and hence less burden on memory 183 | :param n_gpu: number of gpus for training 184 | :param report_model:if report model's structure and number of trainable params in logging 185 | :param per_gpu_train_batch_size: what it means literally 186 | :param restore_training: if restore training if the training process is interupped due to some accidents 187 | :param local_rank:for distributed training 188 | :param wandb_config: wandb logging if not none, else without wandb logging 189 | """ 190 | 191 | self.up_model = up_model 192 | if down_layer == None: 193 | # In this example, the upstream_layer already integrate the downstream head (namely, simple LM head as in transformers.GPT2LMHeadModel) 194 | # EmptyHeads is created here only for placeholder purpose 195 | down_layer = EmptyHeads() 196 | 197 | self.down_layer = down_layer 198 | assert output_path != None 199 | output_path = os.path.join("tmp", output_path) 200 | # os.makedirs(output_path,exist_ok=True) 201 | if restore_training: 202 | if not os.listdir(output_path): 203 | raise ValueError(f"no checkpoint found in {output_path}") 204 | else: 205 | logger.info(" loading embedding weights from saved checkpoint") 206 | self.up_model = self.up_model.reload( 207 | output_path) # for other transformers (apart from bert), the load_saved function has not been added 208 | 209 | logger.info(" loading downstream weights from saved checkpoint") 210 | self.down_layer.load_saved(output_path) 211 | with open(output_path + "/ck_report.json") as f: 212 | self.ck_report = json.load(f) 213 | 214 | self.model = torch.nn.Sequential(self.up_model, self.down_layer) 215 | 216 | if is_wandb_available() and wandb_config != None: 217 | # keep track of model topology and gradients if is_wandb_available and args!=None 218 | wandb.init(project=wandb_config.wandb_project_name, config=wandb_config, name=wandb_config.wandb_run_name) 219 | wandb.watch( 220 | (self.up_model, self.down_layer), log_freq=max(100, evaluation_steps) 221 | ) 222 | self.wandb_config = wandb_config 223 | 224 | self._restore_training = restore_training 225 | self.early_stop = early_stop 226 | 227 | self._dev_evaluator = dev_evaluator 228 | 229 | self._evaluation_steps = evaluation_steps 230 | self._save_best_model = save_best_model 231 | self._max_grad_norm = max_grad_norm 232 | 233 | os.makedirs(output_path, exist_ok=True) 234 | if os.listdir(output_path) and not restore_training: 235 | out = input( 236 | "Output directory ({}) already exists and is not empty, you wanna remove it before start? 
(y/n)".format( 237 | output_path)) 238 | if out == "y": 239 | shutil.rmtree(output_path) 240 | os.makedirs(output_path, exist_ok=True) 241 | else: 242 | raise ValueError("Output directory ({}) already exists and is not empty".format( 243 | output_path)) 244 | 245 | logFormatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 246 | fileHandler = logging.FileHandler(os.path.join(output_path, "log.out"), mode="a") 247 | fileHandler.setFormatter(logFormatter) 248 | logger.addHandler(fileHandler) 249 | self._dev_evaluator.reset_logger(output_path) 250 | 251 | self.output_path = output_path 252 | 253 | if device is None or device == "cuda": 254 | if torch.cuda.is_available(): 255 | device = torch.device("cuda") 256 | n_gpu = 1 if n_gpu == 1 else torch.cuda.device_count() 257 | else: 258 | logger.warning("no cuda is found in your machine, now use cpu") 259 | device = torch.device("cpu") 260 | n_gpu = 0 261 | elif device == "cpu": 262 | device = torch.device("cpu") 263 | n_gpu = 0 264 | else: 265 | raise ValueError("set device to be None, cuda or cpu") 266 | assert n_gpu <= torch.cuda.device_count() 267 | 268 | logger.info("Use pytorch device: {}, with gpu_number={}".format(device, n_gpu)) 269 | 270 | self._train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) 271 | self._dev_batch_size = dev_batch_size if dev_batch_size != -1 else self._train_batch_size 272 | 273 | if isinstance(train_dataset, data.IterableDataset): 274 | self._train_dataloader = DataLoader(train_dataset, batch_size=None) 275 | self._steps_per_epoch = len(self._train_dataloader.dataset) 276 | else: 277 | self._train_dataloader = DataLoader(train_dataset, shuffle=data_loader_shuffle, 278 | batch_size=self._train_batch_size) 279 | self._steps_per_epoch = len(self._train_dataloader) 280 | 281 | if isinstance(dev_dataset, data.IterableDataset): 282 | dev_dataloader = DataLoader(dev_dataset, batch_size=None) 283 | else: 284 | dev_dataloader = DataLoader(dev_dataset, shuffle=data_loader_shuffle, batch_size=self._dev_batch_size) 285 | 286 | if accumulation_steps > 1: 287 | self._steps_per_epoch = self._steps_per_epoch // accumulation_steps 288 | 289 | self._dev_data = dev_dataset 290 | self._dev_evaluator.reset_dataloader(dev_dataloader) 291 | 292 | self.collate_fn = CollateFunction(self.up_model) 293 | # Use customize batching 294 | self._train_dataloader.collate_fn = self.collate_fn 295 | 296 | self._train_data = train_dataset 297 | self._per_gpu_train_batch_size = per_gpu_train_batch_size 298 | 299 | set_seed(seed, n_gpu) 300 | 301 | if n_gpu > 1: 302 | self.model = torch.nn.DataParallel(self.model, device_ids=[int(i) for i in visiable_device.split(',')]) 303 | self.model = self.model.to(f'cuda:{self.model.device_ids[0]}') 304 | 305 | elif n_gpu == 1: 306 | self.model = self.model.to(device) 307 | 308 | self._device = device 309 | self._n_gpu = n_gpu 310 | 311 | self._total_train_steps = int(self._steps_per_epoch * epochs) 312 | self._epochs = epochs 313 | 314 | if report_model: 315 | count_params(self.model, print_details=True) 316 | 317 | param_optimizer = list(self.model.named_parameters()) 318 | 319 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 320 | optimizer_grouped_parameters = [ 321 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 322 | 'weight_decay': weight_decay}, 323 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 324 | ] 325 | if local_rank != -1: 326 | self._total_train_steps = 
self._total_train_steps // torch.distributed.get_world_size() 327 | 328 | self._optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) 329 | 330 | warmup_steps = math.ceil(self._total_train_steps * warmup_ratio) # by default 20% of train data for warm-up 331 | logger.info(f" Warmup-steps: {warmup_steps}") 332 | 333 | self._scheduler = self._get_scheduler(self._optimizer, scheduler=scheduler, warmup_steps=warmup_steps, 334 | num_total=self._total_train_steps) 335 | 336 | if fp16: 337 | try: 338 | from apex import amp 339 | except ImportError: 340 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 341 | 342 | model, optimizer = amp.initialize(self.model, self._optimizer, opt_level=fp16_opt_level) 343 | self.model = model 344 | self._optimizer = optimizer 345 | 346 | self._fp16 = fp16 347 | tb_writer = None 348 | if local_rank in [-1, 0]: 349 | tb_writer = SummaryWriter() 350 | self._tb_writer = tb_writer 351 | self._local_rank = local_rank 352 | self._best_score = -float("inf") 353 | self._early_stop_count = 0 354 | self.last_time = datetime.now() 355 | self.accumulation_steps = accumulation_steps 356 | # assert evaluation_steps % accumulation_steps == 0, "evaluation_steps should be divisable by accumulation_steps" 357 | 358 | def _train_epoch(self, epoch: int, global_steps: int): 359 | epoch_steps = 0 360 | epoch_loss = 0.0 361 | 362 | self.model.zero_grad() 363 | for step, data in enumerate( 364 | tqdm(self._train_dataloader, desc="training", total=self._steps_per_epoch * self.accumulation_steps)): 365 | 366 | self.model.train() 367 | if data["labels"] != "skip-device": 368 | input, labels = batch_to_device(data, self._device) 369 | # add labels to input for training where this step is ignored when inference 370 | if isinstance(labels, dict): 371 | for idx in range(len(input)): 372 | input[idx].update(labels) 373 | else: 374 | input = data["features"] 375 | loss_value, _ = self.model(input) 376 | 377 | if self._n_gpu > 1: 378 | loss_value = loss_value.mean() 379 | if self.accumulation_steps > 1: 380 | loss_value = loss_value / self.accumulation_steps 381 | 382 | if self._fp16: 383 | try: 384 | from apex import amp 385 | except ImportError: 386 | raise ImportError( 387 | "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 388 | with amp.scale_loss(loss_value, self._optimizer) as scaled_loss: 389 | scaled_loss.backward() 390 | torch.nn.utils.clip_grad_norm_(amp.master_params(self._optimizer), self._max_grad_norm) 391 | else: 392 | loss_value.backward() 393 | torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._max_grad_norm) 394 | epoch_loss += loss_value 395 | 396 | if (step + 1) % self.accumulation_steps == 0: 397 | 398 | self._optimizer.step() 399 | self._scheduler.step() 400 | self.model.zero_grad() 401 | 402 | epoch_steps += 1 403 | total_global = epoch_steps + global_steps 404 | 405 | if self._evaluation_steps > 0 and (total_global) % self._evaluation_steps == 0: 406 | dev_loss, eval_scores = self._dev_eval_in_training(epoch, epoch_steps) 407 | logger.info(" ***** Evaluation report *****") 408 | logger.info(f" Output path (short): {self.output_path}") 409 | logger.info(f" Early stop on: {self._dev_evaluator.early_stop_on}") 410 | logger.info(f" Early stop count = {self._early_stop_count}/{self.early_stop}") 411 | logger.info( 412 | f" Eval steps = {self._evaluation_steps} or (iterations = {self._evaluation_steps * self.accumulation_steps})") 413 | logger.info(f" Best 
414 |                     logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
415 |
416 |                     logger.info(
417 |                         f" Num of training examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
418 |                     logger.info(
419 |                         f" Num of development examples (actually no. of iterations per epoch for Iterable Dataset) = {len(self._dev_data)}")
420 |                     now_time = datetime.now()
421 |                     logger.info(f" Time spent since last evaluation = {self.time_diff(self.last_time, now_time)}")
422 |                     self.last_time = now_time
423 |
424 |                     logger.info(f" Epoch = {epoch + 1}/{self._epochs}")
425 |                     logger.info(f" Steps = {total_global}/{self._total_train_steps}")
426 |                     logger.info(
427 |                         f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
428 |                     if dev_loss is not None:
429 |                         logger.info(f" dev_loss = {dev_loss:.6f}\t||\t dev_eval_scores = {eval_scores}")
430 |                     else:
431 |                         logger.info(f" dev_eval_scores = {eval_scores}")
432 |
433 |                     train_loss = epoch_loss / epoch_steps
434 |                     logger.info(f" train_loss = {train_loss}")
435 |                     logger.info("\n********************************************")
436 |
437 |                     if is_wandb_available() and self.wandb_config is not None:
438 |                         if dev_loss is not None:
439 |                             wandb.log(
440 |                                 {"loss_dev": dev_loss,
441 |                                  f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
442 |                                  "loss_train": train_loss, "lr": self._scheduler.get_lr()[0]},
443 |                                 step=total_global)
444 |                         else:
445 |                             wandb.log({"loss_train": train_loss,
446 |                                        f"best_score_for_{self._dev_evaluator.early_stop_on}": self._best_score,
447 |                                        "lr": self._scheduler.get_lr()[0]},
448 |                                       step=total_global)
449 |
450 |                     for key, value in eval_scores.items():
451 |                         if is_wandb_available() and self.wandb_config is not None:
452 |                             wandb.log({f"eval_{key}_dev": value}, step=total_global)
453 |                         self._tb_writer.add_scalar(f"eval_{key}_dev", value, total_global)
454 |
455 |                     self._tb_writer.add_scalar("lr", self._scheduler.get_lr()[0], total_global)
456 |                     if dev_loss is not None:
457 |                         self._tb_writer.add_scalar("loss_dev", dev_loss, total_global)
458 |
459 |                     self._tb_writer.add_scalar("loss_train", train_loss, total_global)
460 |
461 |                     if self._early_stop_count >= self.early_stop:
462 |                         logger.info(
463 |                             f" {self.early_stop} consecutive evaluation steps without improvement, so early stopping...")
464 |                         sys.exit(0)
465 |
466 |         return epoch_loss, epoch_steps
467 |
468 |     def train(self):
469 |         if self._restore_training:
470 |             logger.info(f"***** restoring training from the previous checkpoint: {self.ck_report} *****")
471 |         else:
472 |             logger.info("***** Running training *****")
473 |         logger.info(
474 |             f" Num of training examples (actually iterations per epoch for Iterable Dataset) = {len(self._train_data)}")
475 |         logger.info(f" Output path (short): {self.output_path}")
476 |         logger.info(
477 |             f" Steps per Epoch = {self._steps_per_epoch} or iterations per epoch = {self._steps_per_epoch * self.accumulation_steps}")
478 |         logger.info(f" Num of Epochs = {self._epochs}")
479 |         logger.info(f" Best score ({self._dev_evaluator.early_stop_on}) = {self._best_score}")
480 |         logger.info(
481 |             f" Eval every {self._evaluation_steps} steps or every {self._evaluation_steps * self.accumulation_steps} iterations")
482 |         logger.info(f" Early stop = {self.early_stop}")
483 |         logger.info(f" Gradient Accumulation steps = {self.accumulation_steps}")
484 |
485 |         logger.info(f" Total optimization steps = {self._total_train_steps}")
486 |         logger.info(
487 |             f" Instantaneous batch size per GPU = {self._per_gpu_train_batch_size} and n_gpu = {self._n_gpu} so the input batch size = {self._train_batch_size}")
488 |         global_loss = 0.0
489 |         global_steps = 0
490 |         self.last_time = datetime.now()
491 |         for epoch in trange(self._epochs, desc="Epoch"):
492 |             epoch_loss, epoch_steps = self._train_epoch(epoch, global_steps)
493 |             global_loss += epoch_loss
494 |             global_steps += epoch_steps
495 |             logger.info(f"epoch {epoch + 1} ends, {self._epochs - epoch - 1} epochs left")
496 |         logger.info(
497 |             f"\nglobal_average_loss={global_loss / global_steps}, global_steps={global_steps} on training set")
498 |
499 |         if self._local_rank in [-1, 0]:
500 |             self._tb_writer.close()
501 |
502 |     def _dev_eval_in_training(self, epoch, steps):
503 |         return_scores = {}
504 |         if self._dev_evaluator is not None:
505 |
506 |             return_scores = self._dev_evaluator(self.model, self.collate_fn,
507 |                                                 output_path=self.output_path, epoch=epoch, steps=steps)
508 |
509 |         early_stop_on = self._dev_evaluator.early_stop_on
510 |
511 |         check_score = -return_scores[early_stop_on] if early_stop_on in ("loss", "perplexity") else \
512 |             return_scores[early_stop_on]
513 |         if check_score >= self._best_score and self._save_best_model:
514 |             eval_scores_transformed = {key:
515 |                                        return_scores[key].item() if torch.is_tensor(return_scores[key]) else
516 |                                        return_scores[key]
517 |                                        for key in return_scores.keys()}
518 |             self.save(self.output_path,
519 |                       {"training_examples (when pos_num=1 for ranking)": len(self._train_data),
520 |                        "evaluation_steps": self._evaluation_steps,
521 |                        "train_batch_size": self._train_batch_size, "epoch": epoch + 1, "total_epochs": self._epochs,
522 |                        "steps": steps,
523 |                        "saved_at_total_steps": steps + epoch * self._steps_per_epoch,
524 |                        "steps_per_epoch": self._steps_per_epoch, "eval_scores_on_dev": eval_scores_transformed})
525 |
526 |             self._best_score = check_score
527 |
528 |             logger.info(f" Save checkpoint at epoch={epoch} step={steps}")
529 |             self._early_stop_count = 0
530 |         else:
531 |             self._early_stop_count += 1
532 |
533 |         return return_scores.pop("loss").item() if "loss" in return_scores else None, return_scores
534 |
535 |     def save(self, path, eval_details):
536 |         if path is None:
537 |             return
538 |         logger.info(f" Save model to {path}")
539 |         contained_modules = []
540 |
541 |         to_iterate = self.model.module._modules if self._n_gpu > 1 else self.model._modules
542 |
543 |         for idx, name in enumerate(to_iterate):
544 |             module = to_iterate[str(name)]
545 |
546 |             model_path = os.path.join(path, str(idx) + "_" + type(module).__name__)
547 |             os.makedirs(model_path, exist_ok=True)
548 |             module.save(model_path)
549 |             contained_modules.append(
550 |                 {'idx': idx, 'name': name, 'path': os.path.basename(model_path), 'type': type(module).__module__})
551 |
552 |         if self.wandb_config is not None:
553 |             with open(os.path.join(path, 'hyperparams.json'), 'w') as f:
554 |                 json.dump(self.wandb_config.__dict__, f, indent=2)
555 |
556 |         with open(os.path.join(path, 'modules.json'), 'w') as fOut:
557 |             json.dump(contained_modules, fOut, indent=2)
558 |         with open(os.path.join(path, 'ck_report.json'), 'w') as fOut:
559 |             json.dump(eval_details, fOut, indent=2)
560 |
561 |     def _get_scheduler(self, optimizer, scheduler: str, warmup_steps: int, num_total: int):
562 |         assert scheduler in ["constantlr", "warmuplinear", "warmupconstant", "warmupcosine",
563 |                              "warmupcosinewithhardrestarts"], (
564 |             'scheduler should be one of ["constantlr", "warmuplinear", "warmupconstant", "warmupcosine", "warmupcosinewithhardrestarts"]')
565 |         if scheduler == 'constantlr':
566 |             return transformers.get_constant_schedule(optimizer)
567 |         elif scheduler == 'warmupconstant':
568 |             return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
569 |         elif scheduler == 'warmuplinear':
570 |             return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
571 |                                                                  num_training_steps=num_total)
572 |         elif scheduler == 'warmupcosine':
573 |             return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
574 |                                                                  num_training_steps=num_total)
575 |         elif scheduler == 'warmupcosinewithhardrestarts':
576 |             return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
577 |                                                                                     num_warmup_steps=warmup_steps,
578 |                                                                                     num_training_steps=num_total)
579 |
580 |     def time_diff(self, t_a, t_b):
581 |         t_diff = relativedelta(t_b, t_a)  # later/end time comes first!
582 |         return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)
583 |
--------------------------------------------------------------------------------
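
The `_train_epoch` loop in trainer.py combines gradient accumulation, gradient clipping, and a per-update scheduler step. Below is a minimal, self-contained sketch of that pattern under simplified assumptions; `model`, `dataloader`, `optimizer`, `scheduler`, and the cross-entropy loss are illustrative placeholders, not this repository's objects.

# Sketch of the gradient-accumulation pattern used in _train_epoch (illustrative only).
import torch

def train_one_epoch(model, dataloader, optimizer, scheduler,
                    accumulation_steps=4, max_grad_norm=1.0, device="cpu"):
    model.train()
    model.zero_grad()
    running_loss = 0.0
    step = -1
    for step, (features, labels) in enumerate(dataloader):
        features, labels = features.to(device), labels.to(device)
        loss = torch.nn.functional.cross_entropy(model(features), labels)
        # Scale the loss so the accumulated gradient matches a single large-batch update.
        (loss / accumulation_steps).backward()
        running_loss += loss.item()
        if (step + 1) % accumulation_steps == 0:
            # In this sketch, clipping happens once per parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()   # one optimizer update per accumulation_steps micro-batches
            scheduler.step()   # the scheduler counts optimizer updates, not micro-batches
            model.zero_grad()
    return running_loss / max(1, step + 1)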
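`_get_scheduler` maps a name such as "warmuplinear" onto the corresponding `transformers` schedule, and `warmup_steps` is derived from `warmup_ratio` via `math.ceil`. A minimal sketch of that wiring, assuming an illustrative ratio of 0.2 and a placeholder linear model rather than this repository's model:

# Sketch of deriving warmup_steps and building a linear warmup schedule (illustrative values).
import math
import torch
import transformers

model = torch.nn.Linear(8, 2)                                 # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

total_train_steps = 1000
warmup_ratio = 0.2
warmup_steps = math.ceil(total_train_steps * warmup_ratio)    # 200 warm-up updates

scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_train_steps)

# The learning rate ramps up linearly over the first warmup_steps updates,
# then decays linearly to zero at total_train_steps.
for _ in range(5):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())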
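`Trainer.save` writes each sub-module into its own "<idx>_<ClassName>" folder and records the layout in modules.json alongside ck_report.json. The following is a minimal sketch of inspecting such a checkpoint directory; the path and the inspection helper are assumptions for illustration, not the repository's own loading code.

# Sketch of inspecting the checkpoint layout written by Trainer.save() (illustrative only).
import json
import os

def inspect_checkpoint(path):
    # modules.json lists one entry per saved sub-module folder.
    with open(os.path.join(path, "modules.json")) as f:
        modules = json.load(f)
    for entry in modules:
        folder = os.path.join(path, entry["path"])
        print(f'{entry["idx"]}: {entry["name"]} ({entry["type"]}) -> {folder}')
    # ck_report.json holds the evaluation details recorded at save time.
    report_file = os.path.join(path, "ck_report.json")
    if os.path.exists(report_file):
        with open(report_file) as f:
            print("eval details at save time:", json.load(f))

# Example (hypothetical path): inspect_checkpoint("model/your_run_output")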