├── README.md
└── bert.py

/README.md:
--------------------------------------------------------------------------------
# bert_chinese_pytorch

Reference code: https://github.com/huggingface/pytorch-pretrained-BERT

CSDN blog post (work in progress): https://blog.csdn.net/Real_Brilliant/article/details/84880528
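
## Data format and usage (sketch)

`MyPro` in `bert.py` expects `train.json`, `val.json` and `test.json` under `--data_dir`, one JSON object per line, each with a `question` text field and a binary `label` (0 or 1, matching `get_labels()`). A minimal sketch of how such a file could be produced — the paths and sample sentences below are hypothetical and the snippet is only an illustration of the format, not part of the original script:

```python
# Illustrative only: write a tiny train.json in the format MyPro expects.
import json

samples = [
    {"question": "今天天气怎么样", "label": 0},  # hypothetical example
    {"question": "帮我订一张机票", "label": 1},  # hypothetical example
]
with open("data/train.json", "w", encoding="utf-8") as f:  # path is an assumption
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")
```

Training can then be launched with, for example, `python bert.py --data_dir ./data --bert_model bert-base-chinese --task_name MyPro --output_dir checkpoints/` (all flags are defined in `bert.py`).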
--------------------------------------------------------------------------------

/bert.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import codecs
import json
import random
import logging
import argparse
from tqdm import tqdm, trange

from sklearn import metrics
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
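
# Illustrative only (not part of the original script): a single-sentence example
# as produced by MyPro._create_examples below would look like
#     InputExample(guid="train-0", text_a="今天天气怎么样", label=0)
# text_b stays None because this task classifies a single sentence.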


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_json(cls, input_file):
        """Reads a JSON Lines file (one JSON object per line)."""
        dicts = []
        with codecs.open(input_file, 'r', 'utf-8') as infs:
            for inf in infs:
                inf = inf.strip()
                dicts.append(json.loads(inf))
        return dicts


class MyPro(DataProcessor):
    """Custom data reader for JSON Lines files.

    Returns:
        examples: the data set; each example carries an index (guid),
            the Chinese text and the label.
    """

    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "train.json")), 'train')

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "val.json")), 'dev')

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_json(os.path.join(data_dir, "test.json")), 'test')

    def get_labels(self):
        return [0, 1]

    def _create_examples(self, dicts, set_type):
        examples = []
        for (i, infor) in enumerate(dicts):
            guid = "%s-%s" % (set_type, i)
            text_a = infor['question']
            label = infor['label']
            examples.append(
                InputExample(guid=guid, text_a=text_a, label=label))
        return examples


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, show_exp=True):
    """Loads a data file into a list of `InputFeatures`s.

    Args:
        examples      : [List] input samples, each holding question, label and index
        label_list    : [List] all possible labels; entries can be int, str, etc., e.g. ['book', 'city', ...]
        max_seq_length: [int] maximum sequence length
        tokenizer     : [Method] tokenizer
    Returns:
        features:
            input_ids  : [ListOfInt] token ids; in the Chinese model each token maps to one word vector
            input_mask : [ListOfInt] 1 for real tokens, 0 for padding tokens
            segment_ids: [ListOfInt] sentence indicator: all 0 for the first sentence, all 1 for the second
            label_id   : [Int] the label converted to its id via label_list
    """
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0        0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]
        if ex_index < 5 and show_exp:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features
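
# Illustrative only (not part of the original script): for a single sentence of
# 3 tokens with max_seq_length = 6, convert_examples_to_features produces
#     tokens      : [CLS] tok1 tok2 tok3 [SEP]
#     input_ids   : id([CLS]) id(tok1) id(tok2) id(tok3) id([SEP]) 0
#     input_mask  : 1 1 1 1 1 0
#     segment_ids : 0 0 0 0 0 0
# i.e. real tokens are masked with 1 and the trailing padding position with 0.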


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
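
# Illustrative only (not part of the original script): with max_length = 6,
# tokens_a of length 5 and tokens_b of length 3, _truncate_seq_pair pops from
# the longer list until the pair fits: 5+3 -> 4+3 -> 3+3, leaving both lists
# with 3 tokens each.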


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the parameters optimized on CPU/RAM back to the model on GPU
    """
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        param_model.data.copy_(param_opti.data)


def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the gradient of the GPU parameters to the CPU/RAM copy of the model
    """
    is_nan = False
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        if param_model.grad is not None:
            if test_nan and torch.isnan(param_model.grad).sum() > 0:
                is_nan = True
            if param_opti.grad is None:
                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
            param_opti.grad.data.copy_(param_model.grad.data)
        else:
            param_opti.grad = None
    return is_nan


def val(model, processor, args, label_list, tokenizer, device):
    """Validate the model on the dev set.

    Args:
        model     : the model
        processor : the data reader
        args      : the argument table
        label_list: all possible labels
        tokenizer : the tokenizer
        device    : the device to run on

    Returns:
        f1: macro-averaged F1 score on the dev set
    """
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, show_exp=False)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    predict = np.zeros((0,), dtype=np.int32)
    gt = np.zeros((0,), dtype=np.int32)
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            pred = logits.max(1)[1]
            predict = np.hstack((predict, pred.cpu().numpy()))
            gt = np.hstack((gt, label_ids.cpu().numpy()))

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

    print('dev set size = {}'.format(len(gt)))
    f1 = np.mean(metrics.f1_score(gt, predict, average=None))
    print('f1 score = {}'.format(f1))

    return f1


def test(model, processor, args, label_list, tokenizer, device):
    """Evaluate the model on the test set.

    Args:
        model     : the model
        processor : the data reader
        args      : the argument table
        label_list: all possible labels
        tokenizer : the tokenizer
        device    : the device to run on

    Returns:
        f1: macro-averaged F1 score on the test set
    """
    test_examples = processor.get_test_examples(args.data_dir)
    test_features = convert_examples_to_features(
        test_examples, label_list, args.max_seq_length, tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

    model.eval()
    predict = np.zeros((0,), dtype=np.int32)
    gt = np.zeros((0,), dtype=np.int32)
    for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            pred = logits.max(1)[1]
            predict = np.hstack((predict, pred.cpu().numpy()))
            gt = np.hstack((gt, label_ids.cpu().numpy()))

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

    f1 = np.mean(metrics.f1_score(gt, predict, average=None))
    print('F1 score on the test set is {}'.format(f1))

    return f1
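
# Note (illustrative, not part of the original script): `f1_score(..., average=None)`
# returns one F1 value per class, so taking `np.mean` of it is equivalent to
# `metrics.f1_score(gt, predict, average='macro')`; with the two classes from
# MyPro.get_labels() this is the macro F1 over labels 0 and 1.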


def main():
    # The ArgumentParser object holds all the information needed to parse the
    # command line into the corresponding Python data types.
    parser = argparse.ArgumentParser()

    # required parameters
    # add_argument() registers each command line argument with the ArgumentParser
    # and tells it how that argument should be handled.
    parser.add_argument("--data_dir",
                        default='/data/users/zfsun3/text_classification/data/',
                        type=str,
                        # required=True,
                        help="The input data dir. Should contain the .json files for the task.")
    parser.add_argument("--bert_model",
                        default='bert-base-chinese',
                        type=str,
                        # required=True,
                        help="Pre-trained BERT model to load, e.g. [bert-base-chinese].")
    parser.add_argument("--task_name",
                        default='MyPro',
                        type=str,
                        # required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='checkpoints/',
                        type=str,
                        # required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--model_save_pth",
                        default='checkpoints/bert_classification.pth',
                        type=str,
                        # required=True,
                        help="The path where the best model checkpoint will be saved.")

    # other parameters
    parser.add_argument("--max_seq_length",
                        default=22,
                        type=int,
                        help="Maximum sequence length after tokenization.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run validation.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Lower-case English characters; has little effect for Chinese.")
    parser.add_argument("--train_batch_size",
                        default=128,
                        type=int,
                        help="Batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=1,
                        type=int,
                        help="Batch size for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA even when it is available.")
    parser.add_argument("--local_rank",
                        default=-1,
                        type=int,
                        help="local_rank for distributed training on gpus.")
    parser.add_argument("--seed",
                        default=777,
                        type=int,
                        help="Random seed for initialization.")
    parser.add_argument("--gradient_accumulation_steps",
                        default=1,
                        type=int,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--optimize_on_cpu",
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU.")
    parser.add_argument("--fp16",
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit.")
    parser.add_argument("--loss_scale",
                        default=128,
                        type=float,
                        help="Loss scaling; positive power-of-2 values can improve fp16 convergence.")

    args = parser.parse_args()
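
    # Illustrative only (not part of the original script): a typical invocation,
    # with hypothetical paths, would be
    #     python bert.py --data_dir ./data --bert_model bert-base-chinese \
    #                    --task_name MyPro --output_dir checkpoints/ \
    #                    --model_save_pth checkpoints/bert_classification.pth \
    #                    --max_seq_length 22 --train_batch_size 128 --eval_batch_size 1
    # Every flag above is defined by the parser in this function; unspecified
    # flags fall back to the defaults listed there.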

    # Processors that prepare the model inputs; the processors in the upstream
    # repo mostly target English data sets.
    processors = {'mypro': MyPro}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
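
    # Note (illustrative, not part of the original script): with the defaults
    # above, BertAdam warms the learning rate up linearly over the first
    # warmup_proportion * t_total optimizer steps (10% of all updates), where
    # t_total is derived from len(train_examples), train_batch_size,
    # gradient_accumulation_steps and num_train_epochs computed earlier.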

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, show_exp=False)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        best_score = 0
        flags = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
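
                # Only step the optimizer every gradient_accumulation_steps
                # mini-batches; gradients from the intermediate batches are
                # accumulated by the loss.backward() calls above.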
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()

            f1 = val(model, processor, args, label_list, tokenizer, device)
            if f1 > best_score:
                best_score = f1
                print('*f1 score = {}'.format(f1))
                flags = 0
                checkpoint = {
                    'state_dict': model.state_dict()
                }
                torch.save(checkpoint, args.model_save_pth)
            else:
                print('f1 score = {}'.format(f1))
                flags += 1
                if flags >= 6:
                    break
            # val() switches the model to eval mode; switch back before the next epoch.
            model.train()

    model.load_state_dict(torch.load(args.model_save_pth)['state_dict'])
    test(model, processor, args, label_list, tokenizer, device)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------