├── README.md
└── bert.py

/README.md:
--------------------------------------------------------------------------------
# bert_chinese_pytorch

Reference code: https://github.com/huggingface/pytorch-pretrained-BERT

CSDN blog post (work in progress): https://blog.csdn.net/Real_Brilliant/article/details/84880528
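
## Data format and usage (sketch)

`MyPro` in `bert.py` expects `train.json`, `val.json` and `test.json` under `--data_dir`, one JSON object per line, each with a `question` text field and a binary `label` (0 or 1, matching `get_labels()`). A minimal sketch of how such a file could be produced — the paths and sample sentences below are hypothetical and the snippet is only an illustration of the format, not part of the original script:

```python
# Illustrative only: write a tiny train.json in the format MyPro expects.
import json

samples = [
    {"question": "今天天气怎么样", "label": 0},  # hypothetical example
    {"question": "帮我订一张机票", "label": 1},  # hypothetical example
]
with open("data/train.json", "w", encoding="utf-8") as f:  # path is an assumption
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")
```

Training can then be launched with, for example, `python bert.py --data_dir ./data --bert_model bert-base-chinese --task_name MyPro --output_dir checkpoints/` (all flags are defined in `bert.py`).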
--------------------------------------------------------------------------------

/bert.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import codecs
import json
import random
import logging
import argparse
from tqdm import tqdm, trange

from sklearn import metrics
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
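
# Illustrative only (not part of the original script): a single-sentence example
# as produced by MyPro._create_examples below would look like
#     InputExample(guid="train-0", text_a="今天天气怎么样", label=0)
# text_b stays None because this task classifies a single sentence.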


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        """Reads a JSON Lines file (one JSON object per line)."""
        dicts = []
        with codecs.open(input_file, 'r', 'utf-8') as infs:
            for inf in infs:
                inf = inf.strip()
                dicts.append(json.loads(inf))
        return dicts


class MyPro(DataProcessor):
    """Custom data reader for JSON Lines files.

    Returns:
        examples: the data set; each example carries an index (guid),
            the Chinese text and the label.
    """

    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), 'train')

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "val.json")), 'dev')

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), 'test')

    def get_labels(self):
        return [0, 1]

    def _create_examples(self, dicts, set_type):
        examples = []
        for (i, infor) in enumerate(dicts):
            guid = "%s-%s" % (set_type, i)
            text_a = infor['question']
            label = infor['label']
            examples.append(
                InputExample(guid=guid, text_a=text_a, label=label))
        return examples


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, show_exp=True):
    """Loads a data file into a list of `InputFeatures`s.

    Args:
        examples      : [List] input samples, each holding question, label and index
        label_list    : [List] all possible labels; entries can be int, str, etc., e.g. ['book', 'city', ...]
        max_seq_length: [int] maximum sequence length
        tokenizer     : [Method] tokenizer
    Returns:
        features:
            input_ids  : [ListOfInt] token ids; in the Chinese model each token maps to one word vector
            input_mask : [ListOfInt] 1 for real tokens, 0 for padding tokens
            segment_ids: [ListOfInt] sentence indicator: all 0 for the first sentence, all 1 for the second
            label_id   : [Int] the label converted to its id via label_list
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0        0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]
        if ex_index < 5 and show_exp:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features
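
# Illustrative only (not part of the original script): for a single sentence of
# 3 tokens with max_seq_length = 6, convert_examples_to_features produces
#     tokens      : [CLS] tok1 tok2 tok3 [SEP]
#     input_ids   : id([CLS]) id(tok1) id(tok2) id(tok3) id([SEP]) 0
#     input_mask  : 1 1 1 1 1 0
#     segment_ids : 0 0 0 0 0 0
# i.e. real tokens are masked with 1 and the trailing padding position with 0.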


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
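
# Illustrative only (not part of the original script): with max_length = 6,
# tokens_a of length 5 and tokens_b of length 3, _truncate_seq_pair pops from
# the longer list until the pair fits: 5+3 -> 4+3 -> 3+3, leaving both lists
# with 3 tokens each.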


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the parameters optimized on CPU/RAM back to the model on GPU
    """
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        param_model.data.copy_(param_opti.data)


def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the gradient of the GPU parameters to the CPU/RAM copy of the model
    """
    is_nan = False
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        if param_model.grad is not None:
            if test_nan and torch.isnan(param_model.grad).sum() > 0:
                is_nan = True
            if param_opti.grad is None:
                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
            param_opti.grad.data.copy_(param_model.grad.data)
        else:
            param_opti.grad = None
    return is_nan


def val(model, processor, args, label_list, tokenizer, device):
    """Validate the model on the dev set.

    Args:
        model     : the model
        processor : the data reader
        args      : the argument table
        label_list: all possible labels
        tokenizer : the tokenizer
        device    : the device to run on

    Returns:
        f1: macro-averaged F1 score on the dev set
    """
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, show_exp=False)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    predict = np.zeros((0,), dtype=np.int32)
    gt = np.zeros((0,), dtype=np.int32)
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            pred = logits.max(1)[1]
            predict = np.hstack((predict, pred.cpu().numpy()))
            gt = np.hstack((gt, label_ids.cpu().numpy()))

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

    print('dev set size = {}'.format(len(gt)))
    f1 = np.mean(metrics.f1_score(gt, predict, average=None))
    print('f1 score = {}'.format(f1))

    return f1


def test(model, processor, args, label_list, tokenizer, device):
    """Evaluate the model on the test set.

    Args:
        model     : the model
        processor : the data reader
        args      : the argument table
        label_list: all possible labels
        tokenizer : the tokenizer
        device    : the device to run on

    Returns:
        f1: macro-averaged F1 score on the test set
    """
    test_examples = processor.get_test_examples(args.data_dir)
    test_features = convert_examples_to_features(
        test_examples, label_list, args.max_seq_length, tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

    model.eval()
    predict = np.zeros((0,), dtype=np.int32)
    gt = np.zeros((0,), dtype=np.int32)
    for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            pred = logits.max(1)[1]
            predict = np.hstack((predict, pred.cpu().numpy()))
            gt = np.hstack((gt, label_ids.cpu().numpy()))

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

    f1 = np.mean(metrics.f1_score(gt, predict, average=None))
    print('F1 score on the test set is {}'.format(f1))

    return f1
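
# Note (illustrative, not part of the original script): `f1_score(..., average=None)`
# returns one F1 value per class, so taking `np.mean` of it is equivalent to
# `metrics.f1_score(gt, predict, average='macro')`; with the two classes from
# MyPro.get_labels() this is the macro F1 over labels 0 and 1.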


def main():
    # The ArgumentParser object holds all the information needed to parse the
    # command line into the corresponding Python data types.
    parser = argparse.ArgumentParser()

    # required parameters
    # add_argument() registers each command line argument with the ArgumentParser
    # and tells it how that argument should be handled.
    parser.add_argument("--data_dir",
                        default='/data/users/zfsun3/text_classification/data/',
                        type=str,
                        # required=True,
                        help="The input data dir. Should contain the .json files for the task.")
    parser.add_argument("--bert_model",
                        default='bert-base-chinese',
                        type=str,
                        # required=True,
                        help="Pre-trained BERT model to load, e.g. [bert-base-chinese].")
    parser.add_argument("--task_name",
                        default='MyPro',
                        type=str,
                        # required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='checkpoints/',
                        type=str,
                        # required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--model_save_pth",
                        default='checkpoints/bert_classification.pth',
                        type=str,
                        # required=True,
                        help="The path where the best model checkpoint will be saved.")

    # other parameters
    parser.add_argument("--max_seq_length",
                        default=22,
                        type=int,
                        help="Maximum sequence length after tokenization.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run validation.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Lower-case English characters; has little effect for Chinese.")
    parser.add_argument("--train_batch_size",
                        default=128,
                        type=int,
                        help="Batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=1,
                        type=int,
                        help="Batch size for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA even when it is available.")
    parser.add_argument("--local_rank",
                        default=-1,
                        type=int,
                        help="local_rank for distributed training on gpus.")
    parser.add_argument("--seed",
                        default=777,
                        type=int,
                        help="Random seed for initialization.")
    parser.add_argument("--gradient_accumulation_steps",
                        default=1,
                        type=int,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--optimize_on_cpu",
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU.")
    parser.add_argument("--fp16",
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument("--loss_scale",
                        default=128,
                        type=float,
                        help="Loss scaling; positive power-of-2 values can improve fp16 convergence.")

    args = parser.parse_args()
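
    # Illustrative only (not part of the original script): a typical invocation,
    # with hypothetical paths, would be
    #     python bert.py --data_dir ./data --bert_model bert-base-chinese \
    #                    --task_name MyPro --output_dir checkpoints/ \
    #                    --model_save_pth checkpoints/bert_classification.pth \
    #                    --max_seq_length 22 --train_batch_size 128 --eval_batch_size 1
    # Every flag above is defined by the parser in this function; unspecified
    # flags fall back to the defaults listed there.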

    # Processors that prepare the model inputs; the processors in the upstream
    # repo mostly target English data sets.
    processors = {'mypro': MyPro}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
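
    # Note (illustrative, not part of the original script): with the defaults
    # above, BertAdam warms the learning rate up linearly over the first
    # warmup_proportion * t_total optimizer steps (10% of all updates), where
    # t_total is derived from len(train_examples), train_batch_size,
    # gradient_accumulation_steps and num_train_epochs computed earlier.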

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, show_exp=False)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        best_score = 0
        flags = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
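
                # Only step the optimizer every gradient_accumulation_steps
                # mini-batches; gradients from the intermediate batches are
                # accumulated by the loss.backward() calls above.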
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()

            f1 = val(model, processor, args, label_list, tokenizer, device)
            if f1 > best_score:
                best_score = f1
                print('*f1 score = {}'.format(f1))
                flags = 0
                checkpoint = {
                    'state_dict': model.state_dict()
                }
                torch.save(checkpoint, args.model_save_pth)
            else:
                print('f1 score = {}'.format(f1))
                flags += 1
                if flags >= 6:
                    break
            # val() switches the model to eval mode; switch back before the next epoch.
            model.train()

    model.load_state_dict(torch.load(args.model_save_pth)['state_dict'])
    test(model, processor, args, label_list, tokenizer, device)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------