├── README.md
├── img
│   └── result.jpg
├── run_classifier.py
├── run_ner.py
├── run_predict.sh
└── run_train.sh

/README.md:
--------------------------------------------------------------------------------
1 | # bert examples
2 | 
3 | #### Introduction
4 | `run_classifier.py` builds on Google's BERT to implement a binary classifier for the [Quora Insincere Questions Classification](https://www.kaggle.com/c/quora-insincere-questions-classification) Kaggle competition.
5 | `run_ner.py` is a named entity recognizer written with BERT on data from the [瑞金医院AI大赛 第一赛季](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.100066.0.0.1cdc33af6yLOXD&raceId=231687) (Ruijin Hospital AI Competition, Season 1).
6 | 
7 | #### Results
8 | :blush: Both scripts have been personally verified to run end to end; only the NER dev-set results are posted here.
9 | ![result](/img/result.jpg)
10 | 
--------------------------------------------------------------------------------
/img/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pydataman/bert_examples/9dbe02f203a56ff2e473e14ed0d49bca7e3449a4/img/result.jpg
--------------------------------------------------------------------------------
/run_classifier.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import csv
23 | import os
24 | import modeling
25 | import optimization
26 | import tokenization
27 | import tensorflow as tf
28 | 
29 | flags = tf.flags
30 | 
31 | FLAGS = flags.FLAGS
32 | 
33 | ## Required parameters
34 | flags.DEFINE_string(
35 |     "data_dir", None,
36 |     "The input data dir. Should contain the .tsv files (or other data files) "
37 |     "for the task.")
38 | 
39 | flags.DEFINE_string(
40 |     "bert_config_file", None,
41 |     "The config json file corresponding to the pre-trained BERT model. "
42 |     "This specifies the model architecture.")
43 | 
44 | flags.DEFINE_string("task_name", None, "The name of the task to train.")
45 | 
46 | flags.DEFINE_string("vocab_file", None,
47 |                     "The vocabulary file that the BERT model was trained on.")
48 | 
49 | flags.DEFINE_string(
50 |     "output_dir", None,
51 |     "The output directory where the model checkpoints will be written.")
52 | 
53 | ## Other parameters
54 | 
55 | flags.DEFINE_string(
56 |     "init_checkpoint", None,
57 |     "Initial checkpoint (usually from a pre-trained BERT model).")
58 | 
59 | flags.DEFINE_bool(
60 |     "do_lower_case", True,
61 |     "Whether to lower case the input text. Should be True for uncased "
62 |     "models and False for cased models.")
63 | 
64 | flags.DEFINE_integer(
65 |     "max_seq_length", 128,
66 |     "The maximum total input sequence length after WordPiece tokenization. 
" 67 | "Sequences longer than this will be truncated, and sequences shorter " 68 | "than this will be padded.") 69 | 70 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 71 | 72 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 73 | 74 | flags.DEFINE_bool( 75 | "do_predict", False, 76 | "Whether to run the model in inference mode on the test set.") 77 | 78 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 79 | 80 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 81 | 82 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 83 | 84 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 85 | 86 | flags.DEFINE_float("num_train_epochs", 3.0, 87 | "Total number of training epochs to perform.") 88 | 89 | flags.DEFINE_float( 90 | "warmup_proportion", 0.1, 91 | "Proportion of training to perform linear learning rate warmup for. " 92 | "E.g., 0.1 = 10% of training.") 93 | 94 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 95 | "How often to save the model checkpoint.") 96 | 97 | flags.DEFINE_integer("iterations_per_loop", 1000, 98 | "How many steps to make in each estimator call.") 99 | 100 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 101 | 102 | tf.flags.DEFINE_string( 103 | "tpu_name", None, 104 | "The Cloud TPU to use for training. This should be either the name " 105 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 106 | "url.") 107 | 108 | tf.flags.DEFINE_string( 109 | "tpu_zone", None, 110 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 111 | "specified, we will attempt to automatically detect the GCE project from " 112 | "metadata.") 113 | 114 | tf.flags.DEFINE_string( 115 | "gcp_project", None, 116 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 117 | "specified, we will attempt to automatically detect the GCE project from " 118 | "metadata.") 119 | 120 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 121 | 122 | flags.DEFINE_integer( 123 | "num_tpu_cores", 8, 124 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 125 | 126 | 127 | class InputExample(object): 128 | """A single training/test example for simple sequence classification.""" 129 | 130 | def __init__(self, guid, text_a, text_b=None, label=None): 131 | """Constructs a InputExample. 132 | 133 | Args: 134 | guid: Unique id for the example. 135 | text_a: string. The untokenized text of the first sequence. For single 136 | sequence tasks, only this sequence must be specified. 137 | text_b: (Optional) string. The untokenized text of the second sequence. 138 | Only must be specified for sequence pair tasks. 139 | label: (Optional) string. The label of the example. This should be 140 | specified for train and dev examples, but not for test examples. 
141 | """ 142 | self.guid = guid 143 | self.text_a = text_a 144 | self.text_b = text_b 145 | self.label = label 146 | 147 | 148 | class InputFeatures(object): 149 | """A single set of features of data.""" 150 | 151 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 152 | self.input_ids = input_ids 153 | self.input_mask = input_mask 154 | self.segment_ids = segment_ids 155 | self.label_id = label_id 156 | 157 | 158 | class DataProcessor(object): 159 | """Base class for data converters for sequence classification data sets.""" 160 | 161 | def get_train_examples(self, data_dir): 162 | """Gets a collection of `InputExample`s for the train set.""" 163 | raise NotImplementedError() 164 | 165 | def get_dev_examples(self, data_dir): 166 | """Gets a collection of `InputExample`s for the dev set.""" 167 | raise NotImplementedError() 168 | 169 | def get_test_examples(self, data_dir): 170 | """Gets a collection of `InputExample`s for prediction.""" 171 | raise NotImplementedError() 172 | 173 | def get_labels(self): 174 | """Gets the list of labels for this data set.""" 175 | raise NotImplementedError() 176 | 177 | @classmethod 178 | def _read_tsv(cls, input_file, quotechar=None): 179 | """Reads a tab separated value file.""" 180 | with tf.gfile.Open(input_file, "r") as f: 181 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 182 | lines = [] 183 | for line in reader: 184 | lines.append(line) 185 | return lines 186 | 187 | 188 | class XnliProcessor(DataProcessor): 189 | """Processor for the XNLI data set.""" 190 | 191 | def __init__(self): 192 | self.language = "zh" 193 | 194 | def get_train_examples(self, data_dir): 195 | """See base class.""" 196 | lines = self._read_tsv( 197 | os.path.join(data_dir, "multinli", 198 | "multinli.train.%s.tsv" % self.language)) 199 | examples = [] 200 | for (i, line) in enumerate(lines): 201 | if i == 0: 202 | continue 203 | guid = "train-%d" % (i) 204 | text_a = tokenization.convert_to_unicode(line[0]) 205 | text_b = tokenization.convert_to_unicode(line[1]) 206 | label = tokenization.convert_to_unicode(line[2]) 207 | if label == tokenization.convert_to_unicode("contradictory"): 208 | label = tokenization.convert_to_unicode("contradiction") 209 | examples.append( 210 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 211 | return examples 212 | 213 | def get_dev_examples(self, data_dir): 214 | """See base class.""" 215 | lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) 216 | examples = [] 217 | for (i, line) in enumerate(lines): 218 | if i == 0: 219 | continue 220 | guid = "dev-%d" % (i) 221 | language = tokenization.convert_to_unicode(line[0]) 222 | if language != tokenization.convert_to_unicode(self.language): 223 | continue 224 | text_a = tokenization.convert_to_unicode(line[6]) 225 | text_b = tokenization.convert_to_unicode(line[7]) 226 | label = tokenization.convert_to_unicode(line[1]) 227 | examples.append( 228 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 229 | return examples 230 | 231 | def get_labels(self): 232 | """See base class.""" 233 | return ["contradiction", "entailment", "neutral"] 234 | 235 | 236 | class MnliProcessor(DataProcessor): 237 | """Processor for the MultiNLI data set (GLUE version).""" 238 | 239 | def get_train_examples(self, data_dir): 240 | """See base class.""" 241 | return self._create_examples( 242 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 243 | 244 | def get_dev_examples(self, data_dir): 245 | """See base class.""" 246 | return 
self._create_examples( 247 | self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), 248 | "dev_matched") 249 | 250 | def get_test_examples(self, data_dir): 251 | """See base class.""" 252 | return self._create_examples( 253 | self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") 254 | 255 | def get_labels(self): 256 | """See base class.""" 257 | return ["contradiction", "entailment", "neutral"] 258 | 259 | def _create_examples(self, lines, set_type): 260 | """Creates examples for the training and dev sets.""" 261 | examples = [] 262 | for (i, line) in enumerate(lines): 263 | if i == 0: 264 | continue 265 | guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) 266 | text_a = tokenization.convert_to_unicode(line[8]) 267 | text_b = tokenization.convert_to_unicode(line[9]) 268 | if set_type == "test": 269 | label = "contradiction" 270 | else: 271 | label = tokenization.convert_to_unicode(line[-1]) 272 | examples.append( 273 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 274 | return examples 275 | 276 | 277 | class MrpcProcessor(DataProcessor): 278 | """Processor for the MRPC data set (GLUE version).""" 279 | 280 | def get_train_examples(self, data_dir): 281 | """See base class.""" 282 | return self._create_examples( 283 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 284 | 285 | def get_dev_examples(self, data_dir): 286 | """See base class.""" 287 | return self._create_examples( 288 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 289 | 290 | def get_test_examples(self, data_dir): 291 | """See base class.""" 292 | return self._create_examples( 293 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 294 | 295 | def get_labels(self): 296 | """See base class.""" 297 | return ["0", "1"] 298 | 299 | def _create_examples(self, lines, set_type): 300 | """Creates examples for the training and dev sets.""" 301 | examples = [] 302 | for (i, line) in enumerate(lines): 303 | if i == 0: 304 | continue 305 | guid = "%s-%s" % (set_type, i) 306 | text_a = tokenization.convert_to_unicode(line[3]) 307 | text_b = tokenization.convert_to_unicode(line[4]) 308 | if set_type == "test": 309 | label = "0" 310 | else: 311 | label = tokenization.convert_to_unicode(line[0]) 312 | examples.append( 313 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 314 | return examples 315 | 316 | 317 | class ColaProcessor(DataProcessor): 318 | """Processor for the CoLA data set (GLUE version).""" 319 | 320 | def get_train_examples(self, data_dir): 321 | """See base class.""" 322 | return self._create_examples( 323 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 324 | 325 | def get_dev_examples(self, data_dir): 326 | """See base class.""" 327 | return self._create_examples( 328 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 329 | 330 | def get_test_examples(self, data_dir): 331 | """See base class.""" 332 | return self._create_examples( 333 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 334 | 335 | def get_labels(self): 336 | """See base class.""" 337 | return ["0", "1"] 338 | 339 | def _create_examples(self, lines, set_type): 340 | """Creates examples for the training and dev sets.""" 341 | examples = [] 342 | for (i, line) in enumerate(lines): 343 | # Only the test set has a header 344 | if set_type == "test" and i == 0: 345 | continue 346 | guid = "%s-%s" % (set_type, i) 347 | if set_type == "test": 348 | text_a = tokenization.convert_to_unicode(line[1]) 349 | label = "0" 350 
| else: 351 | text_a = tokenization.convert_to_unicode(line[3]) 352 | label = tokenization.convert_to_unicode(line[1]) 353 | examples.append( 354 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 355 | return examples 356 | 357 | 358 | class QuoraProcessor(DataProcessor): 359 | """processor for kaggle quora""" 360 | def get_train_examples(self, data_dir): 361 | return self._create_examples( 362 | self._read_tsv(os.path.join(data_dir, "train.csv")), "train" 363 | ) 364 | 365 | def get_dev_examples(self, data_dir): 366 | return self._create_examples( 367 | self._read_tsv(os.path.join(data_dir, "dev.csv")), "dev" 368 | ) 369 | 370 | def get_test_examples(self, data_dir): 371 | return self._create_examples( 372 | self._read_tsv(os.path.join(data_dir, "test.csv")), "test" 373 | ) 374 | 375 | def get_labels(self): 376 | return ["0", "1"] 377 | 378 | def _create_examples(self, lines, set_type): 379 | examples = [] 380 | for (i, line) in enumerate(lines): 381 | if i == 0: 382 | continue 383 | guid = "%s-%s" % (set_type, i) 384 | if set_type == "test": 385 | if len(line) != 2: 386 | continue 387 | text_a = tokenization.convert_to_unicode(line[1]) 388 | label = "0" 389 | else: 390 | if len(line) != 3: 391 | continue 392 | text_a = tokenization.convert_to_unicode(line[1]) 393 | label = tokenization.convert_to_unicode(line[2]) 394 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 395 | return examples 396 | 397 | 398 | def convert_single_example(ex_index, example, label_list, max_seq_length, 399 | tokenizer): 400 | """Converts a single `InputExample` into a single `InputFeatures`.""" 401 | label_map = {} 402 | for (i, label) in enumerate(label_list): 403 | label_map[label] = i 404 | 405 | tokens_a = tokenizer.tokenize(example.text_a) 406 | tokens_b = None 407 | if example.text_b: 408 | tokens_b = tokenizer.tokenize(example.text_b) 409 | 410 | if tokens_b: 411 | # Modifies `tokens_a` and `tokens_b` in place so that the total 412 | # length is less than the specified length. 413 | # Account for [CLS], [SEP], [SEP] with "- 3" 414 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 415 | else: 416 | # Account for [CLS] and [SEP] with "- 2" 417 | if len(tokens_a) > max_seq_length - 2: 418 | tokens_a = tokens_a[0:(max_seq_length - 2)] 419 | 420 | # The convention in BERT is: 421 | # (a) For sequence pairs: 422 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 423 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 424 | # (b) For single sequences: 425 | # tokens: [CLS] the dog is hairy . [SEP] 426 | # type_ids: 0 0 0 0 0 0 0 427 | # 428 | # Where "type_ids" are used to indicate whether this is the first 429 | # sequence or the second sequence. The embedding vectors for `type=0` and 430 | # `type=1` were learned during pre-training and are added to the wordpiece 431 | # embedding vector (and position vector). This is not *strictly* necessary 432 | # since the [SEP] token unambiguously separates the sequences, but it makes 433 | # it easier for the model to learn the concept of sequences. 434 | # 435 | # For classification tasks, the first vector (corresponding to [CLS]) is 436 | # used as as the "sentence vector". Note that this only makes sense because 437 | # the entire model is fine-tuned. 
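  #
  # Illustrative sketch (a made-up single-sentence Quora-style example; this
  # assumes every word is already one WordPiece and max_seq_length=8) of what
  # the code below builds:
  #   tokens:      [CLS] is this question sincere ? [SEP]
  #   segment_ids: [0, 0, 0, 0, 0, 0, 0]  -> zero-padded to [0]*8
  #   input_mask:  [1, 1, 1, 1, 1, 1, 1]  -> zero-padded to [1]*7 + [0]
  #   input_ids:   the 7 vocabulary ids, padded with one trailing 0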
438 | tokens = [] 439 | segment_ids = [] 440 | tokens.append("[CLS]") 441 | segment_ids.append(0) 442 | for token in tokens_a: 443 | tokens.append(token) 444 | segment_ids.append(0) 445 | tokens.append("[SEP]") 446 | segment_ids.append(0) 447 | 448 | if tokens_b: 449 | for token in tokens_b: 450 | tokens.append(token) 451 | segment_ids.append(1) 452 | tokens.append("[SEP]") 453 | segment_ids.append(1) 454 | 455 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 456 | 457 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 458 | # tokens are attended to. 459 | input_mask = [1] * len(input_ids) 460 | 461 | # Zero-pad up to the sequence length. 462 | while len(input_ids) < max_seq_length: 463 | input_ids.append(0) 464 | input_mask.append(0) 465 | segment_ids.append(0) 466 | 467 | assert len(input_ids) == max_seq_length 468 | assert len(input_mask) == max_seq_length 469 | assert len(segment_ids) == max_seq_length 470 | 471 | label_id = label_map[example.label] 472 | if ex_index < 5: 473 | tf.logging.info("*** Example ***") 474 | tf.logging.info("guid: %s" % (example.guid)) 475 | tf.logging.info("tokens: %s" % " ".join( 476 | [tokenization.printable_text(x) for x in tokens])) 477 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 478 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 479 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 480 | tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) 481 | 482 | feature = InputFeatures( 483 | input_ids=input_ids, 484 | input_mask=input_mask, 485 | segment_ids=segment_ids, 486 | label_id=label_id) 487 | return feature 488 | 489 | 490 | def file_based_convert_examples_to_features( 491 | examples, label_list, max_seq_length, tokenizer, output_file): 492 | """Convert a set of `InputExample`s to a TFRecord file.""" 493 | 494 | writer = tf.python_io.TFRecordWriter(output_file) 495 | 496 | for (ex_index, example) in enumerate(examples): 497 | if ex_index % 10000 == 0: 498 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 499 | 500 | feature = convert_single_example(ex_index, example, label_list, 501 | max_seq_length, tokenizer) 502 | 503 | def create_int_feature(values): 504 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 505 | return f 506 | 507 | features = collections.OrderedDict() 508 | features["input_ids"] = create_int_feature(feature.input_ids) 509 | features["input_mask"] = create_int_feature(feature.input_mask) 510 | features["segment_ids"] = create_int_feature(feature.segment_ids) 511 | features["label_ids"] = create_int_feature([feature.label_id]) 512 | 513 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 514 | writer.write(tf_example.SerializeToString()) 515 | 516 | 517 | def file_based_input_fn_builder(input_file, seq_length, is_training, 518 | drop_remainder): 519 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 520 | 521 | name_to_features = { 522 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 523 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 524 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 525 | "label_ids": tf.FixedLenFeature([], tf.int64), 526 | } 527 | 528 | def _decode_record(record, name_to_features): 529 | """Decodes a record to a TensorFlow example.""" 530 | example = tf.parse_single_example(record, name_to_features) 531 | 532 | # tf.Example only supports tf.int64, but the TPU only 
supports tf.int32. 533 | # So cast all int64 to int32. 534 | for name in list(example.keys()): 535 | t = example[name] 536 | if t.dtype == tf.int64: 537 | t = tf.to_int32(t) 538 | example[name] = t 539 | 540 | return example 541 | 542 | def input_fn(params): 543 | """The actual input function.""" 544 | batch_size = params["batch_size"] 545 | 546 | # For training, we want a lot of parallel reading and shuffling. 547 | # For eval, we want no shuffling and parallel reading doesn't matter. 548 | d = tf.data.TFRecordDataset(input_file) 549 | if is_training: 550 | d = d.repeat() 551 | d = d.shuffle(buffer_size=100) 552 | 553 | d = d.apply( 554 | tf.contrib.data.map_and_batch( 555 | lambda record: _decode_record(record, name_to_features), 556 | batch_size=batch_size, 557 | drop_remainder=drop_remainder)) 558 | 559 | return d 560 | 561 | return input_fn 562 | 563 | 564 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 565 | """Truncates a sequence pair in place to the maximum length.""" 566 | 567 | # This is a simple heuristic which will always truncate the longer sequence 568 | # one token at a time. This makes more sense than truncating an equal percent 569 | # of tokens from each, since if one sequence is very short then each token 570 | # that's truncated likely contains more information than a longer sequence. 571 | while True: 572 | total_length = len(tokens_a) + len(tokens_b) 573 | if total_length <= max_length: 574 | break 575 | if len(tokens_a) > len(tokens_b): 576 | tokens_a.pop() 577 | else: 578 | tokens_b.pop() 579 | 580 | 581 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 582 | labels, num_labels, use_one_hot_embeddings): 583 | """Creates a classification model.""" 584 | model = modeling.BertModel( 585 | config=bert_config, 586 | is_training=is_training, 587 | input_ids=input_ids, 588 | input_mask=input_mask, 589 | token_type_ids=segment_ids, 590 | use_one_hot_embeddings=use_one_hot_embeddings) 591 | 592 | # In the demo, we are doing a simple classification task on the entire 593 | # segment. 594 | # 595 | # If you want to use the token-level output, use model.get_sequence_output() 596 | # instead. 
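  # Shape note: model.get_pooled_output() returns [batch_size, hidden_size]
  # (the transformed [CLS] vector), whereas model.get_sequence_output()
  # returns [batch_size, seq_length, hidden_size]; run_ner.py below uses the
  # latter so that every token position can be classified.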
597 | output_layer = model.get_pooled_output() 598 | 599 | hidden_size = output_layer.shape[-1].value 600 | 601 | output_weights = tf.get_variable( 602 | "output_weights", [num_labels, hidden_size], 603 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 604 | 605 | output_bias = tf.get_variable( 606 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 607 | 608 | with tf.variable_scope("loss"): 609 | if is_training: 610 | # I.e., 0.1 dropout 611 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 612 | 613 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 614 | logits = tf.nn.bias_add(logits, output_bias) 615 | probabilities = tf.nn.softmax(logits, axis=-1) 616 | log_probs = tf.nn.log_softmax(logits, axis=-1) 617 | 618 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 619 | 620 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 621 | loss = tf.reduce_mean(per_example_loss) 622 | 623 | return (loss, per_example_loss, logits, probabilities) 624 | 625 | 626 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 627 | num_train_steps, num_warmup_steps, use_tpu, 628 | use_one_hot_embeddings): 629 | """Returns `model_fn` closure for TPUEstimator.""" 630 | 631 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 632 | """The `model_fn` for TPUEstimator.""" 633 | 634 | tf.logging.info("*** Features ***") 635 | for name in sorted(features.keys()): 636 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 637 | 638 | input_ids = features["input_ids"] 639 | input_mask = features["input_mask"] 640 | segment_ids = features["segment_ids"] 641 | label_ids = features["label_ids"] 642 | 643 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 644 | 645 | (total_loss, per_example_loss, logits, probabilities) = create_model( 646 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 647 | num_labels, use_one_hot_embeddings) 648 | 649 | tvars = tf.trainable_variables() 650 | 651 | scaffold_fn = None 652 | if init_checkpoint: 653 | (assignment_map, initialized_variable_names 654 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 655 | if use_tpu: 656 | 657 | def tpu_scaffold(): 658 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 659 | return tf.train.Scaffold() 660 | 661 | scaffold_fn = tpu_scaffold 662 | else: 663 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 664 | 665 | tf.logging.info("**** Trainable Variables ****") 666 | for var in tvars: 667 | init_string = "" 668 | if var.name in initialized_variable_names: 669 | init_string = ", *INIT_FROM_CKPT*" 670 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 671 | init_string) 672 | 673 | output_spec = None 674 | if mode == tf.estimator.ModeKeys.TRAIN: 675 | 676 | train_op = optimization.create_optimizer( 677 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 678 | 679 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 680 | mode=mode, 681 | loss=total_loss, 682 | train_op=train_op, 683 | scaffold_fn=scaffold_fn) 684 | elif mode == tf.estimator.ModeKeys.EVAL: 685 | 686 | def metric_fn(per_example_loss, label_ids, logits): 687 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 688 | accuracy = tf.metrics.accuracy(label_ids, predictions) 689 | loss = tf.metrics.mean(per_example_loss) 690 | return { 691 | "eval_accuracy": accuracy, 692 | "eval_loss": loss, 693 | } 694 
| 695 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 696 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 697 | mode=mode, 698 | loss=total_loss, 699 | eval_metrics=eval_metrics, 700 | scaffold_fn=scaffold_fn) 701 | else: 702 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 703 | mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn) 704 | return output_spec 705 | 706 | return model_fn 707 | 708 | 709 | # This function is not used by this file but is still used by the Colab and 710 | # people who depend on it. 711 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 712 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 713 | 714 | all_input_ids = [] 715 | all_input_mask = [] 716 | all_segment_ids = [] 717 | all_label_ids = [] 718 | 719 | for feature in features: 720 | all_input_ids.append(feature.input_ids) 721 | all_input_mask.append(feature.input_mask) 722 | all_segment_ids.append(feature.segment_ids) 723 | all_label_ids.append(feature.label_id) 724 | 725 | def input_fn(params): 726 | """The actual input function.""" 727 | batch_size = params["batch_size"] 728 | 729 | num_examples = len(features) 730 | 731 | # This is for demo purposes and does NOT scale to large data sets. We do 732 | # not use Dataset.from_generator() because that uses tf.py_func which is 733 | # not TPU compatible. The right way to load data is with TFRecordReader. 734 | d = tf.data.Dataset.from_tensor_slices({ 735 | "input_ids": 736 | tf.constant( 737 | all_input_ids, shape=[num_examples, seq_length], 738 | dtype=tf.int32), 739 | "input_mask": 740 | tf.constant( 741 | all_input_mask, 742 | shape=[num_examples, seq_length], 743 | dtype=tf.int32), 744 | "segment_ids": 745 | tf.constant( 746 | all_segment_ids, 747 | shape=[num_examples, seq_length], 748 | dtype=tf.int32), 749 | "label_ids": 750 | tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), 751 | }) 752 | 753 | if is_training: 754 | d = d.repeat() 755 | d = d.shuffle(buffer_size=100) 756 | 757 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 758 | return d 759 | 760 | return input_fn 761 | 762 | 763 | # This function is not used by this file but is still used by the Colab and 764 | # people who depend on it. 
765 | def convert_examples_to_features(examples, label_list, max_seq_length, 766 | tokenizer): 767 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 768 | 769 | features = [] 770 | for (ex_index, example) in enumerate(examples): 771 | if ex_index % 10000 == 0: 772 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 773 | 774 | feature = convert_single_example(ex_index, example, label_list, 775 | max_seq_length, tokenizer) 776 | 777 | features.append(feature) 778 | return features 779 | 780 | 781 | def main(_): 782 | tf.logging.set_verbosity(tf.logging.INFO) 783 | 784 | processors = { 785 | "cola": ColaProcessor, 786 | "mnli": MnliProcessor, 787 | "mrpc": MrpcProcessor, 788 | "xnli": XnliProcessor, 789 | "kaggle-quora": QuoraProcessor, 790 | } 791 | 792 | if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: 793 | raise ValueError( 794 | "At least one of `do_train`, `do_eval` or `do_predict' must be True.") 795 | 796 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 797 | 798 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 799 | raise ValueError( 800 | "Cannot use sequence length %d because the BERT model " 801 | "was only trained up to sequence length %d" % 802 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 803 | 804 | tf.gfile.MakeDirs(FLAGS.output_dir) 805 | 806 | task_name = FLAGS.task_name.lower() 807 | 808 | if task_name not in processors: 809 | raise ValueError("Task not found: %s" % (task_name)) 810 | 811 | processor = processors[task_name]() 812 | 813 | label_list = processor.get_labels() 814 | 815 | tokenizer = tokenization.FullTokenizer( 816 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 817 | 818 | tpu_cluster_resolver = None 819 | if FLAGS.use_tpu and FLAGS.tpu_name: 820 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 821 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 822 | 823 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 824 | run_config = tf.contrib.tpu.RunConfig( 825 | cluster=tpu_cluster_resolver, 826 | master=FLAGS.master, 827 | model_dir=FLAGS.output_dir, 828 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 829 | tpu_config=tf.contrib.tpu.TPUConfig( 830 | iterations_per_loop=FLAGS.iterations_per_loop, 831 | num_shards=FLAGS.num_tpu_cores, 832 | per_host_input_for_training=is_per_host)) 833 | 834 | train_examples = None 835 | num_train_steps = None 836 | num_warmup_steps = None 837 | if FLAGS.do_train: 838 | train_examples = processor.get_train_examples(FLAGS.data_dir) 839 | num_train_steps = int( 840 | len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 841 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 842 | 843 | model_fn = model_fn_builder( 844 | bert_config=bert_config, 845 | num_labels=len(label_list), 846 | init_checkpoint=FLAGS.init_checkpoint, 847 | learning_rate=FLAGS.learning_rate, 848 | num_train_steps=num_train_steps, 849 | num_warmup_steps=num_warmup_steps, 850 | use_tpu=FLAGS.use_tpu, 851 | use_one_hot_embeddings=FLAGS.use_tpu) 852 | 853 | # If TPU is not available, this will fall back to normal Estimator on CPU 854 | # or GPU. 
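  #
  # A typical invocation for the Quora task (illustrative only; the paths are
  # placeholders, and run_train.sh in this repo presumably wraps something
  # similar):
  #
  #   python run_classifier.py \
  #     --task_name=kaggle-quora --do_train=true --do_eval=true \
  #     --data_dir=$DATA_DIR \
  #     --vocab_file=$BERT_BASE_DIR/vocab.txt \
  #     --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  #     --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  #     --max_seq_length=128 --train_batch_size=32 \
  #     --learning_rate=5e-5 --num_train_epochs=3.0 \
  #     --output_dir=$OUTPUT_DIR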
855 | estimator = tf.contrib.tpu.TPUEstimator( 856 | use_tpu=FLAGS.use_tpu, 857 | model_fn=model_fn, 858 | config=run_config, 859 | train_batch_size=FLAGS.train_batch_size, 860 | eval_batch_size=FLAGS.eval_batch_size, 861 | predict_batch_size=FLAGS.predict_batch_size) 862 | 863 | if FLAGS.do_train: 864 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 865 | file_based_convert_examples_to_features( 866 | train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 867 | tf.logging.info("***** Running training *****") 868 | tf.logging.info(" Num examples = %d", len(train_examples)) 869 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 870 | tf.logging.info(" Num steps = %d", num_train_steps) 871 | train_input_fn = file_based_input_fn_builder( 872 | input_file=train_file, 873 | seq_length=FLAGS.max_seq_length, 874 | is_training=True, 875 | drop_remainder=True) 876 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 877 | 878 | if FLAGS.do_eval: 879 | eval_examples = processor.get_dev_examples(FLAGS.data_dir) 880 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 881 | file_based_convert_examples_to_features( 882 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 883 | 884 | tf.logging.info("***** Running evaluation *****") 885 | tf.logging.info(" Num examples = %d", len(eval_examples)) 886 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 887 | 888 | # This tells the estimator to run through the entire set. 889 | eval_steps = None 890 | # However, if running eval on the TPU, you will need to specify the 891 | # number of steps. 892 | if FLAGS.use_tpu: 893 | # Eval will be slightly WRONG on the TPU because it will truncate 894 | # the last batch. 895 | eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) 896 | 897 | eval_drop_remainder = True if FLAGS.use_tpu else False 898 | eval_input_fn = file_based_input_fn_builder( 899 | input_file=eval_file, 900 | seq_length=FLAGS.max_seq_length, 901 | is_training=False, 902 | drop_remainder=eval_drop_remainder) 903 | 904 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 905 | 906 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 907 | with tf.gfile.GFile(output_eval_file, "w") as writer: 908 | tf.logging.info("***** Eval results *****") 909 | for key in sorted(result.keys()): 910 | tf.logging.info(" %s = %s", key, str(result[key])) 911 | writer.write("%s = %s\n" % (key, str(result[key]))) 912 | 913 | if FLAGS.do_predict: 914 | predict_examples = processor.get_test_examples(FLAGS.data_dir) 915 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 916 | file_based_convert_examples_to_features(predict_examples, label_list, 917 | FLAGS.max_seq_length, tokenizer, 918 | predict_file) 919 | 920 | tf.logging.info("***** Running prediction*****") 921 | tf.logging.info(" Num examples = %d", len(predict_examples)) 922 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 923 | 924 | if FLAGS.use_tpu: 925 | # Warning: According to tpu_estimator.py Prediction on TPU is an 926 | # experimental feature and hence not supported here 927 | raise ValueError("Prediction in TPU not supported") 928 | 929 | predict_drop_remainder = True if FLAGS.use_tpu else False 930 | predict_input_fn = file_based_input_fn_builder( 931 | input_file=predict_file, 932 | seq_length=FLAGS.max_seq_length, 933 | is_training=False, 934 | drop_remainder=predict_drop_remainder) 935 | 936 | result = 
estimator.predict(input_fn=predict_input_fn)
937 | 
938 |     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.csv")
939 |     with tf.gfile.GFile(output_predict_file, "w") as writer:
940 |       tf.logging.info("***** Predict results *****")
941 |       for prediction in result:
942 |         # Write out the predicted probability of
943 |         # each class (0 and 1) for every test example,
944 |         # tab-separated.
945 |         output_line = "\t".join(
946 |             str(class_probability) for class_probability in prediction) + "\n"
947 |         writer.write(output_line)
948 | 
949 | 
950 | if __name__ == "__main__":
951 |   flags.mark_flag_as_required("data_dir")
952 |   flags.mark_flag_as_required("task_name")
953 |   flags.mark_flag_as_required("vocab_file")
954 |   flags.mark_flag_as_required("bert_config_file")
955 |   flags.mark_flag_as_required("output_dir")
956 |   tf.app.run()
957 | 
--------------------------------------------------------------------------------
/run_ner.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import csv
23 | import os
24 | import modeling
25 | import optimization
26 | import tokenization
27 | import tensorflow as tf
28 | from sklearn.metrics import f1_score, precision_score, recall_score
29 | import codecs
30 | 
31 | flags = tf.flags
32 | 
33 | FLAGS = flags.FLAGS
34 | 
35 | ## Required parameters
36 | flags.DEFINE_string(
37 |     "data_dir", None,
38 |     "The input data dir. Should contain the .tsv files (or other data files) "
39 |     "for the task.")
40 | 
41 | flags.DEFINE_string(
42 |     "bert_config_file", None,
43 |     "The config json file corresponding to the pre-trained BERT model. "
44 |     "This specifies the model architecture.")
45 | 
46 | flags.DEFINE_string("task_name", None, "The name of the task to train.")
47 | 
48 | flags.DEFINE_string("vocab_file", None,
49 |                     "The vocabulary file that the BERT model was trained on.")
50 | 
51 | flags.DEFINE_string(
52 |     "output_dir", None,
53 |     "The output directory where the model checkpoints will be written.")
54 | 
55 | ## Other parameters
56 | 
57 | flags.DEFINE_string(
58 |     "init_checkpoint", None,
59 |     "Initial checkpoint (usually from a pre-trained BERT model).")
60 | 
61 | flags.DEFINE_bool(
62 |     "do_lower_case", True,
63 |     "Whether to lower case the input text. Should be True for uncased "
64 |     "models and False for cased models.")
65 | 
66 | flags.DEFINE_integer(
67 |     "max_seq_length", 128,
68 |     "The maximum total input sequence length after WordPiece tokenization. 
" 69 | "Sequences longer than this will be truncated, and sequences shorter " 70 | "than this will be padded.") 71 | 72 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 73 | 74 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 75 | 76 | flags.DEFINE_bool( 77 | "do_predict", False, 78 | "Whether to run the model in inference mode on the test set.") 79 | 80 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 81 | 82 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 83 | 84 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 85 | 86 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 87 | 88 | flags.DEFINE_float("num_train_epochs", 3.0, 89 | "Total number of training epochs to perform.") 90 | 91 | flags.DEFINE_float( 92 | "warmup_proportion", 0.1, 93 | "Proportion of training to perform linear learning rate warmup for. " 94 | "E.g., 0.1 = 10% of training.") 95 | 96 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 97 | "How often to save the model checkpoint.") 98 | 99 | flags.DEFINE_integer("iterations_per_loop", 1000, 100 | "How many steps to make in each estimator call.") 101 | 102 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 103 | 104 | tf.flags.DEFINE_string( 105 | "tpu_name", None, 106 | "The Cloud TPU to use for training. This should be either the name " 107 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 108 | "url.") 109 | 110 | tf.flags.DEFINE_string( 111 | "tpu_zone", None, 112 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 113 | "specified, we will attempt to automatically detect the GCE project from " 114 | "metadata.") 115 | 116 | tf.flags.DEFINE_string( 117 | "gcp_project", None, 118 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 119 | "specified, we will attempt to automatically detect the GCE project from " 120 | "metadata.") 121 | 122 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 123 | 124 | flags.DEFINE_integer( 125 | "num_tpu_cores", 8, 126 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 127 | 128 | 129 | class InputExample(object): 130 | """A single training/test example for simple sequence classification.""" 131 | 132 | def __init__(self, guid, text, label=None): 133 | """Constructs a InputExample. 134 | 135 | Args: 136 | guid: Unique id for the example. 137 | text: string. The untokenized text of the first sequence. For single 138 | sequence tasks, only this sequence must be specified. 139 | label: (Optional) string. The label of the example. This should be 140 | specified for train and dev examples, but not for test examples. 
141 | """ 142 | self.guid = guid 143 | self.text = text 144 | self.label = label 145 | 146 | 147 | class InputFeatures(object): 148 | """A single set of features of data.""" 149 | 150 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 151 | self.input_ids = input_ids 152 | self.input_mask = input_mask 153 | self.segment_ids = segment_ids 154 | self.label_ids = label_ids 155 | 156 | 157 | class DataProcessor(object): 158 | """Base class for data converters for sequence classification data sets.""" 159 | 160 | def get_train_examples(self, data_dir): 161 | """Gets a collection of `InputExample`s for the train set.""" 162 | raise NotImplementedError() 163 | 164 | def get_dev_examples(self, data_dir): 165 | """Gets a collection of `InputExample`s for the dev set.""" 166 | raise NotImplementedError() 167 | 168 | def get_test_examples(self, data_dir): 169 | """Gets a collection of `InputExample`s for prediction.""" 170 | raise NotImplementedError() 171 | 172 | def get_labels(self): 173 | """Gets the list of labels for this data set.""" 174 | raise NotImplementedError() 175 | 176 | @classmethod 177 | def _read_tsv(cls, input_file, quotechar=None): 178 | """BIO""" 179 | # with tf.gfile.Open(input_file, "r") as f: 180 | # reader = csv.reader(f, delimiter=" ", quotechar=quotechar) 181 | # lines = [] 182 | # for line in reader: 183 | # lines.append(line) 184 | # return lines 185 | with codecs.open(input_file, "r", encoding="utf-8") as f: 186 | lines = [] 187 | words = [] 188 | labels = [] 189 | for line in f: 190 | content = line.strip() 191 | word = line.strip().split("\t")[0] 192 | if len(line.strip().split()) > 1: 193 | label = line.strip().split("\t")[1] 194 | else: 195 | label = "O" 196 | if len(content) == 0 and words[-1] == "。": 197 | l = " ".join([label for label in labels if len(label) > 0]) 198 | w = " ".join([word for word in words if len(word) > 0]) 199 | lines.append([w, l]) 200 | words = [] 201 | labels = [] 202 | continue 203 | words.append(word) 204 | labels.append(label) 205 | return lines 206 | 207 | 208 | class NerProcessor(DataProcessor): 209 | """processor for kaggle quora""" 210 | 211 | def get_train_examples(self, data_dir): 212 | return self._create_examples( 213 | self._read_tsv(os.path.join(data_dir, "train.csv")), "train" 214 | ) 215 | 216 | def get_dev_examples(self, data_dir): 217 | return self._create_examples( 218 | self._read_tsv(os.path.join(data_dir, "dev.csv")), "dev" 219 | ) 220 | 221 | def get_test_examples(self, data_dir): 222 | return self._create_examples( 223 | self._read_tsv(os.path.join(data_dir, "test.csv")), "test" 224 | ) 225 | 226 | def get_labels(self): 227 | ''' 228 | ner class 229 | :return: 230 | ''' 231 | return ["X", 'O', 'B-Disease', 'I-Disease', 'B-Reason', 'I-Reason', "B-Symptom", "I-Symptom", "B-Test", 232 | "I-Test", 233 | "B-Test_Value", "I-Test_Value", "B-Drug", "I-Drug", "B-Frequency", "I-Frequency", "B-Amount", 234 | "I-Amount", 235 | "B-Treatment", "I-Treatment", "B-Operation", "I-Operation", "B-Method", "I-Method", "B-SideEff", 236 | "I-SideEff", "B-Anatomy", "I-Anatomy", "B-Level", "I-Level", "B-Duration", "I-Duration"] 237 | 238 | def _create_examples(self, lines, set_type): 239 | examples = [] 240 | label_list = [] 241 | for (i, line) in enumerate(lines): 242 | guid = "%s-%s" % (set_type, i) 243 | 244 | text = tokenization.convert_to_unicode(line[0]) 245 | label = tokenization.convert_to_unicode(line[1]) 246 | label_list.append(label) 247 | examples.append(InputExample(guid=guid, text=text, label=label)) 248 | 249 | 
if set_type == "test": 250 | return examples 251 | else: 252 | return examples, label_list 253 | 254 | 255 | def convert_single_example(ex_index, example, label_list, max_seq_length, 256 | tokenizer): 257 | """Converts a single `InputExample` into a single `InputFeatures`.""" 258 | label_map = {} 259 | for (i, label) in enumerate(label_list): 260 | label_map[label] = i 261 | text_list = example.text.split(" ") 262 | labellist = example.label.split(" ") 263 | tokens = [] 264 | labels = [] 265 | for i, word in enumerate(text_list): 266 | token = tokenizer.tokenize(word) 267 | tokens.extend(token) 268 | label_ = labellist[i] 269 | for n in range(len(token)): 270 | if n == 0: 271 | labels.append(label_) 272 | else: 273 | labels.append("X") 274 | 275 | # Account for [CLS] and [SEP] with "- 2" 276 | if len(tokens) > max_seq_length - 2: 277 | tokens = tokens[0:(max_seq_length - 2)] 278 | labels = labels[0: (max_seq_length - 2)] 279 | 280 | # The convention in BERT is: 281 | # (a) For sequence pairs: 282 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 283 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 284 | # (b) For single sequences: 285 | # tokens: [CLS] the dog is hairy . [SEP] 286 | # type_ids: 0 0 0 0 0 0 0 287 | # 288 | # Where "type_ids" are used to indicate whether this is the first 289 | # sequence or the second sequence. The embedding vectors for `type=0` and 290 | # `type=1` were learned during pre-training and are added to the wordpiece 291 | # embedding vector (and position vector). This is not *strictly* necessary 292 | # since the [SEP] token unambiguously separates the sequences, but it makes 293 | # it easier for the model to learn the concept of sequences. 294 | # 295 | # For classification tasks, the first vector (corresponding to [CLS]) is 296 | # used as as the "sentence vector". Note that this only makes sense because 297 | # the entire model is fine-tuned. 298 | ntokens = [] 299 | segment_ids = [] 300 | label_ids = [] 301 | ntokens.append("[CLS]") 302 | segment_ids.append(0) 303 | label_ids.append(0) 304 | for i, token in enumerate(tokens): 305 | ntokens.append(token) 306 | segment_ids.append(0) 307 | label_ids.append(label_map[labels[i]]) 308 | ntokens.append("[SEP]") 309 | segment_ids.append(0) 310 | label_ids.append(0) 311 | 312 | input_ids = tokenizer.convert_tokens_to_ids(ntokens) 313 | 314 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 315 | # tokens are attended to. 316 | input_mask = [1] * len(input_ids) 317 | 318 | # Zero-pad up to the sequence length. 
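  # Note that label_ids is padded with 0 as well: index 0 is "X", the first
  # entry of get_labels(), i.e. the same placeholder id that [CLS], [SEP] and
  # non-initial WordPieces receive above. Padded positions are excluded from
  # attention by input_mask, but they still contribute to the mean loss
  # computed in create_model below.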
319 | while len(input_ids) < max_seq_length: 320 | input_ids.append(0) 321 | input_mask.append(0) 322 | segment_ids.append(0) 323 | label_ids.append(0) 324 | 325 | assert len(input_ids) == max_seq_length 326 | assert len(input_mask) == max_seq_length 327 | assert len(segment_ids) == max_seq_length 328 | assert len(label_ids) == max_seq_length 329 | 330 | if ex_index < 5: 331 | tf.logging.info("*** Example ***") 332 | tf.logging.info("guid: %s" % (example.guid)) 333 | tf.logging.info("tokens: %s" % " ".join( 334 | [tokenization.printable_text(x) for x in tokens])) 335 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 336 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 337 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 338 | tf.logging.info("label_ids: %s " % " ".join([str(x) for x in label_ids])) 339 | 340 | feature = InputFeatures( 341 | input_ids=input_ids, 342 | input_mask=input_mask, 343 | segment_ids=segment_ids, 344 | label_ids=label_ids) 345 | return feature 346 | 347 | 348 | def file_based_convert_examples_to_features( 349 | examples, label_list, max_seq_length, tokenizer, output_file): 350 | """Convert a set of `InputExample`s to a TFRecord file.""" 351 | 352 | writer = tf.python_io.TFRecordWriter(output_file) 353 | 354 | for (ex_index, example) in enumerate(examples): 355 | if ex_index % 10000 == 0: 356 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 357 | 358 | feature = convert_single_example(ex_index, example, label_list, 359 | max_seq_length, tokenizer) 360 | 361 | def create_int_feature(values): 362 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 363 | return f 364 | 365 | features = collections.OrderedDict() 366 | features["input_ids"] = create_int_feature(feature.input_ids) 367 | features["input_mask"] = create_int_feature(feature.input_mask) 368 | features["segment_ids"] = create_int_feature(feature.segment_ids) 369 | features["label_ids"] = create_int_feature(feature.label_ids) 370 | 371 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 372 | writer.write(tf_example.SerializeToString()) 373 | 374 | 375 | def file_based_input_fn_builder(input_file, seq_length, is_training, 376 | drop_remainder): 377 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 378 | 379 | name_to_features = { 380 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 381 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 382 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 383 | "label_ids": tf.FixedLenFeature([seq_length], tf.int64), 384 | } 385 | 386 | def _decode_record(record, name_to_features): 387 | """Decodes a record to a TensorFlow example.""" 388 | example = tf.parse_single_example(record, name_to_features) 389 | 390 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 391 | # So cast all int64 to int32. 392 | for name in list(example.keys()): 393 | t = example[name] 394 | if t.dtype == tf.int64: 395 | t = tf.to_int32(t) 396 | example[name] = t 397 | 398 | return example 399 | 400 | def input_fn(params): 401 | """The actual input function.""" 402 | batch_size = params["batch_size"] 403 | 404 | # For training, we want a lot of parallel reading and shuffling. 405 | # For eval, we want no shuffling and parallel reading doesn't matter. 
406 | d = tf.data.TFRecordDataset(input_file) 407 | if is_training: 408 | d = d.repeat() 409 | d = d.shuffle(buffer_size=100) 410 | 411 | d = d.apply( 412 | tf.contrib.data.map_and_batch( 413 | lambda record: _decode_record(record, name_to_features), 414 | batch_size=batch_size, 415 | drop_remainder=drop_remainder)) 416 | 417 | return d 418 | 419 | return input_fn 420 | 421 | 422 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 423 | """Truncates a sequence pair in place to the maximum length.""" 424 | 425 | # This is a simple heuristic which will always truncate the longer sequence 426 | # one token at a time. This makes more sense than truncating an equal percent 427 | # of tokens from each, since if one sequence is very short then each token 428 | # that's truncated likely contains more information than a longer sequence. 429 | while True: 430 | total_length = len(tokens_a) + len(tokens_b) 431 | if total_length <= max_length: 432 | break 433 | if len(tokens_a) > len(tokens_b): 434 | tokens_a.pop() 435 | else: 436 | tokens_b.pop() 437 | 438 | 439 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 440 | labels, num_labels, use_one_hot_embeddings): 441 | """Creates a classification model.""" 442 | model = modeling.BertModel( 443 | config=bert_config, 444 | is_training=is_training, 445 | input_ids=input_ids, 446 | input_mask=input_mask, 447 | token_type_ids=segment_ids, 448 | use_one_hot_embeddings=use_one_hot_embeddings) 449 | 450 | # In the demo, we are doing a simple classification task on the entire 451 | # segment. 452 | # 453 | # If you want to use the token-level output, use model.get_sequence_output() 454 | # instead. 455 | output_layer = model.get_sequence_output() 456 | # final_hidden_shape = modeling.get_shape_list(output_layer, expected_rank=3) 457 | # batch_size = final_hidden_shape[0] 458 | # seq_length = final_hidden_shape[1] 459 | # hidden_size = final_hidden_shape[2] 460 | 461 | hidden_size = output_layer.shape[-1].value 462 | 463 | output_weights = tf.get_variable( 464 | "output_weights", [num_labels, hidden_size], 465 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 466 | 467 | output_bias = tf.get_variable( 468 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 469 | 470 | with tf.variable_scope("loss"): 471 | if is_training: 472 | # I.e., 0.1 dropout 473 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 474 | output_layer = tf.reshape(output_layer, [-1, hidden_size]) 475 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 476 | logits = tf.nn.bias_add(logits, output_bias) 477 | # probabilities = tf.nn.softmax(logits, axis=-1) 478 | logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) 479 | log_probs = tf.nn.log_softmax(logits, axis=-1) 480 | 481 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 482 | # print(tf.shape(one_hot_labels)) 483 | # Tensor("loss/Shape_2:0", shape=(2,), dtype=int32) 484 | 485 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 486 | loss = tf.reduce_mean(per_example_loss) 487 | 488 | return (loss, per_example_loss, logits) 489 | 490 | 491 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 492 | num_train_steps, num_warmup_steps, use_tpu, 493 | use_one_hot_embeddings): 494 | """Returns `model_fn` closure for TPUEstimator.""" 495 | 496 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 497 | """The `model_fn` for TPUEstimator.""" 498 | 
499 |     tf.logging.info("*** Features ***")
500 |     for name in sorted(features.keys()):
501 |       tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
502 | 
503 |     input_ids = features["input_ids"]
504 |     input_mask = features["input_mask"]
505 |     segment_ids = features["segment_ids"]
506 |     label_ids = features["label_ids"]
507 | 
508 |     is_training = (mode == tf.estimator.ModeKeys.TRAIN)
509 | 
510 |     (total_loss, per_example_loss, logits) = create_model(
511 |         bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
512 |         num_labels, use_one_hot_embeddings)
513 | 
514 |     tvars = tf.trainable_variables()
515 |     initialized_variable_names = {}  # stays empty when no checkpoint is restored
516 |     scaffold_fn = None
517 |     if init_checkpoint:
518 |       (assignment_map, initialized_variable_names
519 |       ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
520 |       if use_tpu:
521 | 
522 |         def tpu_scaffold():
523 |           tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
524 |           return tf.train.Scaffold()
525 | 
526 |         scaffold_fn = tpu_scaffold
527 |       else:
528 |         tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
529 | 
530 |     tf.logging.info("**** Trainable Variables ****")
531 |     for var in tvars:
532 |       init_string = ""
533 |       if var.name in initialized_variable_names:
534 |         init_string = ", *INIT_FROM_CKPT*"
535 |       tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
536 |                       init_string)
537 | 
538 |     output_spec = None
539 |     if mode == tf.estimator.ModeKeys.TRAIN:
540 | 
541 |       train_op = optimization.create_optimizer(
542 |           total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
543 | 
544 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
545 |           mode=mode,
546 |           loss=total_loss,
547 |           train_op=train_op,
548 |           scaffold_fn=scaffold_fn)
549 |     elif mode == tf.estimator.ModeKeys.EVAL:
550 | 
551 |       def metric_fn(per_example_loss, label_ids, logits):
552 |         predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
553 |         accuracy = tf.metrics.accuracy(label_ids, predictions)
554 |         # Per-label precision and recall for the NER task are computed
555 |         # separately with sklearn's classification_report in get_eval().
556 |         loss = tf.metrics.mean(per_example_loss)
557 |         return {
558 |             "eval_accuracy": accuracy,
559 |             "eval_loss": loss,
560 |         }
561 | 
562 |       eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
563 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
564 |           mode=mode,
565 |           loss=total_loss,
566 |           eval_metrics=eval_metrics,
567 |           scaffold_fn=scaffold_fn)
568 |     elif mode == tf.estimator.ModeKeys.PREDICT:
569 |       predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
570 |       predict_output = {"values": predictions}
571 |       export_outputs = {"predictions": tf.estimator.export.PredictOutput(predict_output)}
572 | 
573 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
574 |           mode=mode,
575 |           predictions=predict_output,
576 |           export_outputs=export_outputs,
577 |           scaffold_fn=scaffold_fn)
578 |     else:
579 |       raise ValueError("Unsupported mode: %s" % mode)
580 |     return output_spec
581 | 
582 |   return model_fn
583 | 
584 | 
585 | def get_eval(pred_result, real_labels, label_list, max_seq_length):
586 |   label_map = {}
587 |   for i, label in enumerate(label_list):
588 |     label_map[label] = i
589 |   import itertools
590 |   predictions = list(itertools.islice(pred_result, len(real_labels)))
591 |   pred_labels = []
592 |   real_labels_ = []
593 |   # Skip examples whose label sequence cannot fit once [CLS]/[SEP] are added.
594 |   for i in range(len(predictions)):
595 |     real = real_labels[i]
596 |     if len(real.split(" ")) > max_seq_length - 2:
597 |       continue
598 |     real_ = [label_map[k] for k in real.split(" ")]
599 |     real_labels_.extend(real_)
600 |     pred = predictions[i]["values"][1: len(real_) + 1]  # position 0 is [CLS]
601 |     pred_labels.extend(pred)
602 |     assert len(real_) == len(pred)
603 |   from sklearn.metrics import classification_report
604 |   print(classification_report(real_labels_, pred_labels))
605 | 
606 | 
607 | def main(_):
608 |   tf.logging.set_verbosity(tf.logging.INFO)
609 | 
610 |   processors = {
611 |       "ner": NerProcessor,
612 |   }
613 | 
614 |   if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
615 |     raise ValueError(
616 |         "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
617 | 
618 |   bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
619 | 
620 |   if FLAGS.max_seq_length > bert_config.max_position_embeddings:
621 |     raise ValueError(
622 |         "Cannot use sequence length %d because the BERT model "
623 |         "was only trained up to sequence length %d" %
624 |         (FLAGS.max_seq_length, bert_config.max_position_embeddings))
625 | 
626 |   tf.gfile.MakeDirs(FLAGS.output_dir)
627 | 
628 |   task_name = FLAGS.task_name.lower()
629 | 
630 |   if task_name not in processors:
631 |     raise ValueError("Task not found: %s" % task_name)
632 | 
633 |   processor = processors[task_name]()
634 | 
635 |   label_list = processor.get_labels()
636 | 
637 |   tokenizer = tokenization.FullTokenizer(
638 |       vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
639 | 
640 |   tpu_cluster_resolver = None
641 |   if FLAGS.use_tpu and FLAGS.tpu_name:
642 |     tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
643 |         FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
644 | 
645 |   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
646 |   run_config = tf.contrib.tpu.RunConfig(
647 |       cluster=tpu_cluster_resolver,
648 |       master=FLAGS.master,
649 |       model_dir=FLAGS.output_dir,
650 |       save_checkpoints_steps=FLAGS.save_checkpoints_steps,
651 |       tpu_config=tf.contrib.tpu.TPUConfig(
652 |           iterations_per_loop=FLAGS.iterations_per_loop,
653 |           num_shards=FLAGS.num_tpu_cores,
654 |           per_host_input_for_training=is_per_host))
655 | 
656 |   train_examples = None
657 |   num_train_steps = None
658 |   num_warmup_steps = None
659 |   if FLAGS.do_train:
660 |     train_examples, _ = processor.get_train_examples(FLAGS.data_dir)
661 |     num_train_steps = int(
662 |         len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
663 |     num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
664 | 
665 |   model_fn = model_fn_builder(
666 |       bert_config=bert_config,
667 |       num_labels=len(label_list),
668 |       init_checkpoint=FLAGS.init_checkpoint,
669 |       learning_rate=FLAGS.learning_rate,
670 |       num_train_steps=num_train_steps,
671 |       num_warmup_steps=num_warmup_steps,
672 |       use_tpu=FLAGS.use_tpu,
673 |       use_one_hot_embeddings=FLAGS.use_tpu)
674 | 
675 |   # If TPU is not available, this will fall back to normal Estimator on CPU
676 |   # or GPU.
677 | estimator = tf.contrib.tpu.TPUEstimator( 678 | use_tpu=FLAGS.use_tpu, 679 | model_fn=model_fn, 680 | config=run_config, 681 | train_batch_size=FLAGS.train_batch_size, 682 | eval_batch_size=FLAGS.eval_batch_size, 683 | predict_batch_size=FLAGS.predict_batch_size) 684 | 685 | if FLAGS.do_train: 686 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 687 | file_based_convert_examples_to_features( 688 | train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 689 | tf.logging.info("***** Running training *****") 690 | tf.logging.info(" Num examples = %d", len(train_examples)) 691 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 692 | tf.logging.info(" Num steps = %d", num_train_steps) 693 | train_input_fn = file_based_input_fn_builder( 694 | input_file=train_file, 695 | seq_length=FLAGS.max_seq_length, 696 | is_training=True, 697 | drop_remainder=True) 698 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 699 | 700 | if FLAGS.do_eval: 701 | eval_examples, real_labels = processor.get_dev_examples(FLAGS.data_dir) 702 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 703 | file_based_convert_examples_to_features( 704 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 705 | 706 | tf.logging.info("***** Running evaluation *****") 707 | tf.logging.info(" Num examples = %d", len(eval_examples)) 708 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 709 | 710 | # This tells the estimator to run through the entire set. 711 | eval_steps = None 712 | # However, if running eval on the TPU, you will need to specify the 713 | # number of steps. 714 | if FLAGS.use_tpu: 715 | # Eval will be slightly WRONG on the TPU because it will truncate 716 | # the last batch. 
717 |       eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
718 | 
719 |     eval_drop_remainder = True if FLAGS.use_tpu else False
720 |     eval_input_fn = file_based_input_fn_builder(
721 |         input_file=eval_file,
722 |         seq_length=FLAGS.max_seq_length,
723 |         is_training=False,
724 |         drop_remainder=eval_drop_remainder)
725 | 
726 |     result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
727 | 
728 |     output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
729 |     with tf.gfile.GFile(output_eval_file, "w") as writer:
730 |       tf.logging.info("***** Eval results *****")
731 |       for key in sorted(result.keys()):
732 |         tf.logging.info("  %s = %s", key, str(result[key]))
733 |         writer.write("%s = %s\n" % (key, str(result[key])))
734 |     pred_result = estimator.predict(input_fn=eval_input_fn)
735 |     get_eval(pred_result, real_labels, label_list, FLAGS.max_seq_length)
736 | 
737 |   if FLAGS.do_predict:
738 |     predict_examples = processor.get_test_examples(FLAGS.data_dir)
739 |     predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
740 |     file_based_convert_examples_to_features(predict_examples, label_list,
741 |                                             FLAGS.max_seq_length, tokenizer,
742 |                                             predict_file)
743 | 
744 |     tf.logging.info("***** Running prediction *****")
745 |     tf.logging.info("  Num examples = %d", len(predict_examples))
746 |     tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
747 | 
748 |     if FLAGS.use_tpu:
749 |       # Warning: according to tpu_estimator.py, prediction on TPU is an
750 |       # experimental feature and hence not supported here.
751 |       raise ValueError("Prediction on TPU is not supported")
752 | 
753 |     predict_drop_remainder = True if FLAGS.use_tpu else False
754 |     predict_input_fn = file_based_input_fn_builder(
755 |         input_file=predict_file,
756 |         seq_length=FLAGS.max_seq_length,
757 |         is_training=False,
758 |         drop_remainder=predict_drop_remainder)
759 | 
760 |     result = estimator.predict(input_fn=predict_input_fn)
761 | 
762 |     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")  # tab-separated
763 |     with tf.gfile.GFile(output_predict_file, "w") as writer:
764 |       tf.logging.info("***** Predict results *****")
765 |       for prediction in result:
766 |         output_line = "\t".join(  # each prediction dict's "values" holds label ids
767 |             str(label_id) for label_id in prediction["values"]) + "\n"
768 |         writer.write(output_line)
769 | 
770 | 
771 | if __name__ == "__main__":
772 |   flags.mark_flag_as_required("data_dir")
773 |   flags.mark_flag_as_required("task_name")
774 |   flags.mark_flag_as_required("vocab_file")
775 |   flags.mark_flag_as_required("bert_config_file")
776 |   flags.mark_flag_as_required("output_dir")
777 |   tf.app.run()
778 | 
--------------------------------------------------------------------------------
/run_predict.sh:
--------------------------------------------------------------------------------
1 | export BERT_BASE_DIR=/path/bert/uncased_L-12_H-768_A-12
2 | export QUORA_DIR=/path/kaggle/quora/data
3 | export TRAINED_CLASSIFIER=/path/quora/data/result
4 | CUDA_VISIBLE_DEVICES=0 python3 ./bert/run_classifier.py \
5 |   --task_name=kaggle-quora \
6 |   --do_train=false \
7 |   --do_predict=true \
8 |   --data_dir=$QUORA_DIR/ \
9 |   --vocab_file=$BERT_BASE_DIR/vocab.txt \
10 |   --bert_config_file=$BERT_BASE_DIR/bert_config.json \
11 |   --init_checkpoint=$TRAINED_CLASSIFIER/model.ckpt \
12 |   --max_seq_length=128 \
13 |   --output_dir=$QUORA_DIR/result/
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | export BERT_BASE_DIR=/path/bert/uncased_L-12_H-768_A-12
2 | export QUORA_DIR=/path/kaggle/quora/data
3 | export CUDA_VISIBLE_DEVICES=1,0
4 | python3 ./bert/run_classifier.py \
5 |   --task_name=kaggle-quora \
6 |   --do_train=true \
7 |   --do_eval=true \
8 |   --data_dir=$QUORA_DIR/ \
9 |   --vocab_file=$BERT_BASE_DIR/vocab.txt \
10 |   --bert_config_file=$BERT_BASE_DIR/bert_config.json \
11 |   --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
12 |   --max_seq_length=128 \
13 |   --train_batch_size=32 \
14 |   --eval_batch_size=64 \
15 |   --predict_batch_size=8 \
16 |   --learning_rate=2e-5 \
17 |   --num_train_epochs=3.0 \
18 |   --output_dir=$QUORA_DIR/result/
19 | 
--------------------------------------------------------------------------------
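Note on launching the NER runner: the shell scripts above cover only the Quora classifier. The NER script (registered under task name "ner" in its processors map) accepts the same flags, all of which are defined at the top of the runner. A minimal launch sketch follows — the checkpoint directory and data paths are placeholder assumptions, not part of the repo:

export BERT_BASE_DIR=/path/bert/chinese_L-12_H-768_A-12   # assumed Chinese BERT checkpoint
export NER_DIR=/path/tianchi/ner/data                     # placeholder data directory
CUDA_VISIBLE_DEVICES=0 python3 ./bert/run_ner.py \
  --task_name=ner \
  --do_train=true \
  --do_eval=true \
  --data_dir=$NER_DIR/ \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --output_dir=$NER_DIR/result/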