├── README.md
├── img
│   └── result.jpg
├── run_classifier.py
├── run_ner.py
├── run_predict.sh
└── run_train.sh

/README.md:
--------------------------------------------------------------------------------
1 | # bert examples
2 | 
3 | #### Introduction
4 | `run_classifier.py` builds on Google's BERT to implement a binary classifier for the [Quora Insincere Questions Classification](https://www.kaggle.com/c/quora-insincere-questions-classification) Kaggle competition.
5 | `run_ner.py` is a named entity recognizer written with BERT on data from the [瑞金医院AI大赛 第一赛季](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.100066.0.0.1cdc33af6yLOXD&raceId=231687) (Ruijin Hospital AI Competition, Season 1).
6 | 
7 | #### Results
8 | :blush: Both scripts have been personally verified to run end to end; only the NER dev-set results are posted here.
9 | ![result](/img/result.jpg)
10 | 
--------------------------------------------------------------------------------
/img/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pydataman/bert_examples/9dbe02f203a56ff2e473e14ed0d49bca7e3449a4/img/result.jpg
--------------------------------------------------------------------------------
/run_classifier.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import csv
23 | import os
24 | import modeling
25 | import optimization
26 | import tokenization
27 | import tensorflow as tf
28 | 
29 | flags = tf.flags
30 | 
31 | FLAGS = flags.FLAGS
32 | 
33 | ## Required parameters
34 | flags.DEFINE_string(
35 |     "data_dir", None,
36 |     "The input data dir. Should contain the .tsv files (or other data files) "
37 |     "for the task.")
38 | 
39 | flags.DEFINE_string(
40 |     "bert_config_file", None,
41 |     "The config json file corresponding to the pre-trained BERT model. "
42 |     "This specifies the model architecture.")
43 | 
44 | flags.DEFINE_string("task_name", None, "The name of the task to train.")
45 | 
46 | flags.DEFINE_string("vocab_file", None,
47 |                     "The vocabulary file that the BERT model was trained on.")
48 | 
49 | flags.DEFINE_string(
50 |     "output_dir", None,
51 |     "The output directory where the model checkpoints will be written.")
52 | 
53 | ## Other parameters
54 | 
55 | flags.DEFINE_string(
56 |     "init_checkpoint", None,
57 |     "Initial checkpoint (usually from a pre-trained BERT model).")
58 | 
59 | flags.DEFINE_bool(
60 |     "do_lower_case", True,
61 |     "Whether to lower case the input text. Should be True for uncased "
62 |     "models and False for cased models.")
63 | 
64 | flags.DEFINE_integer(
65 |     "max_seq_length", 128,
66 |     "The maximum total input sequence length after WordPiece tokenization. 
" 67 | "Sequences longer than this will be truncated, and sequences shorter " 68 | "than this will be padded.") 69 | 70 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 71 | 72 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 73 | 74 | flags.DEFINE_bool( 75 | "do_predict", False, 76 | "Whether to run the model in inference mode on the test set.") 77 | 78 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 79 | 80 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 81 | 82 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 83 | 84 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 85 | 86 | flags.DEFINE_float("num_train_epochs", 3.0, 87 | "Total number of training epochs to perform.") 88 | 89 | flags.DEFINE_float( 90 | "warmup_proportion", 0.1, 91 | "Proportion of training to perform linear learning rate warmup for. " 92 | "E.g., 0.1 = 10% of training.") 93 | 94 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 95 | "How often to save the model checkpoint.") 96 | 97 | flags.DEFINE_integer("iterations_per_loop", 1000, 98 | "How many steps to make in each estimator call.") 99 | 100 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 101 | 102 | tf.flags.DEFINE_string( 103 | "tpu_name", None, 104 | "The Cloud TPU to use for training. This should be either the name " 105 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 106 | "url.") 107 | 108 | tf.flags.DEFINE_string( 109 | "tpu_zone", None, 110 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 111 | "specified, we will attempt to automatically detect the GCE project from " 112 | "metadata.") 113 | 114 | tf.flags.DEFINE_string( 115 | "gcp_project", None, 116 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 117 | "specified, we will attempt to automatically detect the GCE project from " 118 | "metadata.") 119 | 120 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 121 | 122 | flags.DEFINE_integer( 123 | "num_tpu_cores", 8, 124 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 125 | 126 | 127 | class InputExample(object): 128 | """A single training/test example for simple sequence classification.""" 129 | 130 | def __init__(self, guid, text_a, text_b=None, label=None): 131 | """Constructs a InputExample. 132 | 133 | Args: 134 | guid: Unique id for the example. 135 | text_a: string. The untokenized text of the first sequence. For single 136 | sequence tasks, only this sequence must be specified. 137 | text_b: (Optional) string. The untokenized text of the second sequence. 138 | Only must be specified for sequence pair tasks. 139 | label: (Optional) string. The label of the example. This should be 140 | specified for train and dev examples, but not for test examples. 
141 | """ 142 | self.guid = guid 143 | self.text_a = text_a 144 | self.text_b = text_b 145 | self.label = label 146 | 147 | 148 | class InputFeatures(object): 149 | """A single set of features of data.""" 150 | 151 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 152 | self.input_ids = input_ids 153 | self.input_mask = input_mask 154 | self.segment_ids = segment_ids 155 | self.label_id = label_id 156 | 157 | 158 | class DataProcessor(object): 159 | """Base class for data converters for sequence classification data sets.""" 160 | 161 | def get_train_examples(self, data_dir): 162 | """Gets a collection of `InputExample`s for the train set.""" 163 | raise NotImplementedError() 164 | 165 | def get_dev_examples(self, data_dir): 166 | """Gets a collection of `InputExample`s for the dev set.""" 167 | raise NotImplementedError() 168 | 169 | def get_test_examples(self, data_dir): 170 | """Gets a collection of `InputExample`s for prediction.""" 171 | raise NotImplementedError() 172 | 173 | def get_labels(self): 174 | """Gets the list of labels for this data set.""" 175 | raise NotImplementedError() 176 | 177 | @classmethod 178 | def _read_tsv(cls, input_file, quotechar=None): 179 | """Reads a tab separated value file.""" 180 | with tf.gfile.Open(input_file, "r") as f: 181 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 182 | lines = [] 183 | for line in reader: 184 | lines.append(line) 185 | return lines 186 | 187 | 188 | class XnliProcessor(DataProcessor): 189 | """Processor for the XNLI data set.""" 190 | 191 | def __init__(self): 192 | self.language = "zh" 193 | 194 | def get_train_examples(self, data_dir): 195 | """See base class.""" 196 | lines = self._read_tsv( 197 | os.path.join(data_dir, "multinli", 198 | "multinli.train.%s.tsv" % self.language)) 199 | examples = [] 200 | for (i, line) in enumerate(lines): 201 | if i == 0: 202 | continue 203 | guid = "train-%d" % (i) 204 | text_a = tokenization.convert_to_unicode(line[0]) 205 | text_b = tokenization.convert_to_unicode(line[1]) 206 | label = tokenization.convert_to_unicode(line[2]) 207 | if label == tokenization.convert_to_unicode("contradictory"): 208 | label = tokenization.convert_to_unicode("contradiction") 209 | examples.append( 210 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 211 | return examples 212 | 213 | def get_dev_examples(self, data_dir): 214 | """See base class.""" 215 | lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) 216 | examples = [] 217 | for (i, line) in enumerate(lines): 218 | if i == 0: 219 | continue 220 | guid = "dev-%d" % (i) 221 | language = tokenization.convert_to_unicode(line[0]) 222 | if language != tokenization.convert_to_unicode(self.language): 223 | continue 224 | text_a = tokenization.convert_to_unicode(line[6]) 225 | text_b = tokenization.convert_to_unicode(line[7]) 226 | label = tokenization.convert_to_unicode(line[1]) 227 | examples.append( 228 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 229 | return examples 230 | 231 | def get_labels(self): 232 | """See base class.""" 233 | return ["contradiction", "entailment", "neutral"] 234 | 235 | 236 | class MnliProcessor(DataProcessor): 237 | """Processor for the MultiNLI data set (GLUE version).""" 238 | 239 | def get_train_examples(self, data_dir): 240 | """See base class.""" 241 | return self._create_examples( 242 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 243 | 244 | def get_dev_examples(self, data_dir): 245 | """See base class.""" 246 | return 
self._create_examples( 247 | self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), 248 | "dev_matched") 249 | 250 | def get_test_examples(self, data_dir): 251 | """See base class.""" 252 | return self._create_examples( 253 | self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") 254 | 255 | def get_labels(self): 256 | """See base class.""" 257 | return ["contradiction", "entailment", "neutral"] 258 | 259 | def _create_examples(self, lines, set_type): 260 | """Creates examples for the training and dev sets.""" 261 | examples = [] 262 | for (i, line) in enumerate(lines): 263 | if i == 0: 264 | continue 265 | guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) 266 | text_a = tokenization.convert_to_unicode(line[8]) 267 | text_b = tokenization.convert_to_unicode(line[9]) 268 | if set_type == "test": 269 | label = "contradiction" 270 | else: 271 | label = tokenization.convert_to_unicode(line[-1]) 272 | examples.append( 273 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 274 | return examples 275 | 276 | 277 | class MrpcProcessor(DataProcessor): 278 | """Processor for the MRPC data set (GLUE version).""" 279 | 280 | def get_train_examples(self, data_dir): 281 | """See base class.""" 282 | return self._create_examples( 283 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 284 | 285 | def get_dev_examples(self, data_dir): 286 | """See base class.""" 287 | return self._create_examples( 288 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 289 | 290 | def get_test_examples(self, data_dir): 291 | """See base class.""" 292 | return self._create_examples( 293 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 294 | 295 | def get_labels(self): 296 | """See base class.""" 297 | return ["0", "1"] 298 | 299 | def _create_examples(self, lines, set_type): 300 | """Creates examples for the training and dev sets.""" 301 | examples = [] 302 | for (i, line) in enumerate(lines): 303 | if i == 0: 304 | continue 305 | guid = "%s-%s" % (set_type, i) 306 | text_a = tokenization.convert_to_unicode(line[3]) 307 | text_b = tokenization.convert_to_unicode(line[4]) 308 | if set_type == "test": 309 | label = "0" 310 | else: 311 | label = tokenization.convert_to_unicode(line[0]) 312 | examples.append( 313 | InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 314 | return examples 315 | 316 | 317 | class ColaProcessor(DataProcessor): 318 | """Processor for the CoLA data set (GLUE version).""" 319 | 320 | def get_train_examples(self, data_dir): 321 | """See base class.""" 322 | return self._create_examples( 323 | self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") 324 | 325 | def get_dev_examples(self, data_dir): 326 | """See base class.""" 327 | return self._create_examples( 328 | self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") 329 | 330 | def get_test_examples(self, data_dir): 331 | """See base class.""" 332 | return self._create_examples( 333 | self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") 334 | 335 | def get_labels(self): 336 | """See base class.""" 337 | return ["0", "1"] 338 | 339 | def _create_examples(self, lines, set_type): 340 | """Creates examples for the training and dev sets.""" 341 | examples = [] 342 | for (i, line) in enumerate(lines): 343 | # Only the test set has a header 344 | if set_type == "test" and i == 0: 345 | continue 346 | guid = "%s-%s" % (set_type, i) 347 | if set_type == "test": 348 | text_a = tokenization.convert_to_unicode(line[1]) 349 | label = "0" 350 
| else: 351 | text_a = tokenization.convert_to_unicode(line[3]) 352 | label = tokenization.convert_to_unicode(line[1]) 353 | examples.append( 354 | InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 355 | return examples 356 | 357 | 358 | class QuoraProcessor(DataProcessor): 359 | """processor for kaggle quora""" 360 | def get_train_examples(self, data_dir): 361 | return self._create_examples( 362 | self._read_tsv(os.path.join(data_dir, "train.csv")), "train" 363 | ) 364 | 365 | def get_dev_examples(self, data_dir): 366 | return self._create_examples( 367 | self._read_tsv(os.path.join(data_dir, "dev.csv")), "dev" 368 | ) 369 | 370 | def get_test_examples(self, data_dir): 371 | return self._create_examples( 372 | self._read_tsv(os.path.join(data_dir, "test.csv")), "test" 373 | ) 374 | 375 | def get_labels(self): 376 | return ["0", "1"] 377 | 378 | def _create_examples(self, lines, set_type): 379 | examples = [] 380 | for (i, line) in enumerate(lines): 381 | if i == 0: 382 | continue 383 | guid = "%s-%s" % (set_type, i) 384 | if set_type == "test": 385 | if len(line) != 2: 386 | continue 387 | text_a = tokenization.convert_to_unicode(line[1]) 388 | label = "0" 389 | else: 390 | if len(line) != 3: 391 | continue 392 | text_a = tokenization.convert_to_unicode(line[1]) 393 | label = tokenization.convert_to_unicode(line[2]) 394 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) 395 | return examples 396 | 397 | 398 | def convert_single_example(ex_index, example, label_list, max_seq_length, 399 | tokenizer): 400 | """Converts a single `InputExample` into a single `InputFeatures`.""" 401 | label_map = {} 402 | for (i, label) in enumerate(label_list): 403 | label_map[label] = i 404 | 405 | tokens_a = tokenizer.tokenize(example.text_a) 406 | tokens_b = None 407 | if example.text_b: 408 | tokens_b = tokenizer.tokenize(example.text_b) 409 | 410 | if tokens_b: 411 | # Modifies `tokens_a` and `tokens_b` in place so that the total 412 | # length is less than the specified length. 413 | # Account for [CLS], [SEP], [SEP] with "- 3" 414 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 415 | else: 416 | # Account for [CLS] and [SEP] with "- 2" 417 | if len(tokens_a) > max_seq_length - 2: 418 | tokens_a = tokens_a[0:(max_seq_length - 2)] 419 | 420 | # The convention in BERT is: 421 | # (a) For sequence pairs: 422 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 423 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 424 | # (b) For single sequences: 425 | # tokens: [CLS] the dog is hairy . [SEP] 426 | # type_ids: 0 0 0 0 0 0 0 427 | # 428 | # Where "type_ids" are used to indicate whether this is the first 429 | # sequence or the second sequence. The embedding vectors for `type=0` and 430 | # `type=1` were learned during pre-training and are added to the wordpiece 431 | # embedding vector (and position vector). This is not *strictly* necessary 432 | # since the [SEP] token unambiguously separates the sequences, but it makes 433 | # it easier for the model to learn the concept of sequences. 434 | # 435 | # For classification tasks, the first vector (corresponding to [CLS]) is 436 | # used as as the "sentence vector". Note that this only makes sense because 437 | # the entire model is fine-tuned. 
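  #
  # Illustrative sketch (a made-up single-sentence Quora-style example; this
  # assumes every word is already one WordPiece and max_seq_length=8) of what
  # the code below builds:
  #   tokens:      [CLS] is this question sincere ? [SEP]
  #   segment_ids: [0, 0, 0, 0, 0, 0, 0]  -> zero-padded to [0]*8
  #   input_mask:  [1, 1, 1, 1, 1, 1, 1]  -> zero-padded to [1]*7 + [0]
  #   input_ids:   the 7 vocabulary ids, padded with one trailing 0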
438 | tokens = [] 439 | segment_ids = [] 440 | tokens.append("[CLS]") 441 | segment_ids.append(0) 442 | for token in tokens_a: 443 | tokens.append(token) 444 | segment_ids.append(0) 445 | tokens.append("[SEP]") 446 | segment_ids.append(0) 447 | 448 | if tokens_b: 449 | for token in tokens_b: 450 | tokens.append(token) 451 | segment_ids.append(1) 452 | tokens.append("[SEP]") 453 | segment_ids.append(1) 454 | 455 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 456 | 457 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 458 | # tokens are attended to. 459 | input_mask = [1] * len(input_ids) 460 | 461 | # Zero-pad up to the sequence length. 462 | while len(input_ids) < max_seq_length: 463 | input_ids.append(0) 464 | input_mask.append(0) 465 | segment_ids.append(0) 466 | 467 | assert len(input_ids) == max_seq_length 468 | assert len(input_mask) == max_seq_length 469 | assert len(segment_ids) == max_seq_length 470 | 471 | label_id = label_map[example.label] 472 | if ex_index < 5: 473 | tf.logging.info("*** Example ***") 474 | tf.logging.info("guid: %s" % (example.guid)) 475 | tf.logging.info("tokens: %s" % " ".join( 476 | [tokenization.printable_text(x) for x in tokens])) 477 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 478 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 479 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 480 | tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) 481 | 482 | feature = InputFeatures( 483 | input_ids=input_ids, 484 | input_mask=input_mask, 485 | segment_ids=segment_ids, 486 | label_id=label_id) 487 | return feature 488 | 489 | 490 | def file_based_convert_examples_to_features( 491 | examples, label_list, max_seq_length, tokenizer, output_file): 492 | """Convert a set of `InputExample`s to a TFRecord file.""" 493 | 494 | writer = tf.python_io.TFRecordWriter(output_file) 495 | 496 | for (ex_index, example) in enumerate(examples): 497 | if ex_index % 10000 == 0: 498 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 499 | 500 | feature = convert_single_example(ex_index, example, label_list, 501 | max_seq_length, tokenizer) 502 | 503 | def create_int_feature(values): 504 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 505 | return f 506 | 507 | features = collections.OrderedDict() 508 | features["input_ids"] = create_int_feature(feature.input_ids) 509 | features["input_mask"] = create_int_feature(feature.input_mask) 510 | features["segment_ids"] = create_int_feature(feature.segment_ids) 511 | features["label_ids"] = create_int_feature([feature.label_id]) 512 | 513 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 514 | writer.write(tf_example.SerializeToString()) 515 | 516 | 517 | def file_based_input_fn_builder(input_file, seq_length, is_training, 518 | drop_remainder): 519 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 520 | 521 | name_to_features = { 522 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 523 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 524 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 525 | "label_ids": tf.FixedLenFeature([], tf.int64), 526 | } 527 | 528 | def _decode_record(record, name_to_features): 529 | """Decodes a record to a TensorFlow example.""" 530 | example = tf.parse_single_example(record, name_to_features) 531 | 532 | # tf.Example only supports tf.int64, but the TPU only 
supports tf.int32. 533 | # So cast all int64 to int32. 534 | for name in list(example.keys()): 535 | t = example[name] 536 | if t.dtype == tf.int64: 537 | t = tf.to_int32(t) 538 | example[name] = t 539 | 540 | return example 541 | 542 | def input_fn(params): 543 | """The actual input function.""" 544 | batch_size = params["batch_size"] 545 | 546 | # For training, we want a lot of parallel reading and shuffling. 547 | # For eval, we want no shuffling and parallel reading doesn't matter. 548 | d = tf.data.TFRecordDataset(input_file) 549 | if is_training: 550 | d = d.repeat() 551 | d = d.shuffle(buffer_size=100) 552 | 553 | d = d.apply( 554 | tf.contrib.data.map_and_batch( 555 | lambda record: _decode_record(record, name_to_features), 556 | batch_size=batch_size, 557 | drop_remainder=drop_remainder)) 558 | 559 | return d 560 | 561 | return input_fn 562 | 563 | 564 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 565 | """Truncates a sequence pair in place to the maximum length.""" 566 | 567 | # This is a simple heuristic which will always truncate the longer sequence 568 | # one token at a time. This makes more sense than truncating an equal percent 569 | # of tokens from each, since if one sequence is very short then each token 570 | # that's truncated likely contains more information than a longer sequence. 571 | while True: 572 | total_length = len(tokens_a) + len(tokens_b) 573 | if total_length <= max_length: 574 | break 575 | if len(tokens_a) > len(tokens_b): 576 | tokens_a.pop() 577 | else: 578 | tokens_b.pop() 579 | 580 | 581 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 582 | labels, num_labels, use_one_hot_embeddings): 583 | """Creates a classification model.""" 584 | model = modeling.BertModel( 585 | config=bert_config, 586 | is_training=is_training, 587 | input_ids=input_ids, 588 | input_mask=input_mask, 589 | token_type_ids=segment_ids, 590 | use_one_hot_embeddings=use_one_hot_embeddings) 591 | 592 | # In the demo, we are doing a simple classification task on the entire 593 | # segment. 594 | # 595 | # If you want to use the token-level output, use model.get_sequence_output() 596 | # instead. 
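  # Shape note: model.get_pooled_output() returns [batch_size, hidden_size]
  # (the transformed [CLS] vector), whereas model.get_sequence_output()
  # returns [batch_size, seq_length, hidden_size]; run_ner.py below uses the
  # latter so that every token position can be classified.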
597 | output_layer = model.get_pooled_output() 598 | 599 | hidden_size = output_layer.shape[-1].value 600 | 601 | output_weights = tf.get_variable( 602 | "output_weights", [num_labels, hidden_size], 603 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 604 | 605 | output_bias = tf.get_variable( 606 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 607 | 608 | with tf.variable_scope("loss"): 609 | if is_training: 610 | # I.e., 0.1 dropout 611 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 612 | 613 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 614 | logits = tf.nn.bias_add(logits, output_bias) 615 | probabilities = tf.nn.softmax(logits, axis=-1) 616 | log_probs = tf.nn.log_softmax(logits, axis=-1) 617 | 618 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 619 | 620 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 621 | loss = tf.reduce_mean(per_example_loss) 622 | 623 | return (loss, per_example_loss, logits, probabilities) 624 | 625 | 626 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 627 | num_train_steps, num_warmup_steps, use_tpu, 628 | use_one_hot_embeddings): 629 | """Returns `model_fn` closure for TPUEstimator.""" 630 | 631 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 632 | """The `model_fn` for TPUEstimator.""" 633 | 634 | tf.logging.info("*** Features ***") 635 | for name in sorted(features.keys()): 636 | tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) 637 | 638 | input_ids = features["input_ids"] 639 | input_mask = features["input_mask"] 640 | segment_ids = features["segment_ids"] 641 | label_ids = features["label_ids"] 642 | 643 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 644 | 645 | (total_loss, per_example_loss, logits, probabilities) = create_model( 646 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 647 | num_labels, use_one_hot_embeddings) 648 | 649 | tvars = tf.trainable_variables() 650 | 651 | scaffold_fn = None 652 | if init_checkpoint: 653 | (assignment_map, initialized_variable_names 654 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 655 | if use_tpu: 656 | 657 | def tpu_scaffold(): 658 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 659 | return tf.train.Scaffold() 660 | 661 | scaffold_fn = tpu_scaffold 662 | else: 663 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 664 | 665 | tf.logging.info("**** Trainable Variables ****") 666 | for var in tvars: 667 | init_string = "" 668 | if var.name in initialized_variable_names: 669 | init_string = ", *INIT_FROM_CKPT*" 670 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 671 | init_string) 672 | 673 | output_spec = None 674 | if mode == tf.estimator.ModeKeys.TRAIN: 675 | 676 | train_op = optimization.create_optimizer( 677 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) 678 | 679 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 680 | mode=mode, 681 | loss=total_loss, 682 | train_op=train_op, 683 | scaffold_fn=scaffold_fn) 684 | elif mode == tf.estimator.ModeKeys.EVAL: 685 | 686 | def metric_fn(per_example_loss, label_ids, logits): 687 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 688 | accuracy = tf.metrics.accuracy(label_ids, predictions) 689 | loss = tf.metrics.mean(per_example_loss) 690 | return { 691 | "eval_accuracy": accuracy, 692 | "eval_loss": loss, 693 | } 694 
| 695 | eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) 696 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 697 | mode=mode, 698 | loss=total_loss, 699 | eval_metrics=eval_metrics, 700 | scaffold_fn=scaffold_fn) 701 | else: 702 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 703 | mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn) 704 | return output_spec 705 | 706 | return model_fn 707 | 708 | 709 | # This function is not used by this file but is still used by the Colab and 710 | # people who depend on it. 711 | def input_fn_builder(features, seq_length, is_training, drop_remainder): 712 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 713 | 714 | all_input_ids = [] 715 | all_input_mask = [] 716 | all_segment_ids = [] 717 | all_label_ids = [] 718 | 719 | for feature in features: 720 | all_input_ids.append(feature.input_ids) 721 | all_input_mask.append(feature.input_mask) 722 | all_segment_ids.append(feature.segment_ids) 723 | all_label_ids.append(feature.label_id) 724 | 725 | def input_fn(params): 726 | """The actual input function.""" 727 | batch_size = params["batch_size"] 728 | 729 | num_examples = len(features) 730 | 731 | # This is for demo purposes and does NOT scale to large data sets. We do 732 | # not use Dataset.from_generator() because that uses tf.py_func which is 733 | # not TPU compatible. The right way to load data is with TFRecordReader. 734 | d = tf.data.Dataset.from_tensor_slices({ 735 | "input_ids": 736 | tf.constant( 737 | all_input_ids, shape=[num_examples, seq_length], 738 | dtype=tf.int32), 739 | "input_mask": 740 | tf.constant( 741 | all_input_mask, 742 | shape=[num_examples, seq_length], 743 | dtype=tf.int32), 744 | "segment_ids": 745 | tf.constant( 746 | all_segment_ids, 747 | shape=[num_examples, seq_length], 748 | dtype=tf.int32), 749 | "label_ids": 750 | tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), 751 | }) 752 | 753 | if is_training: 754 | d = d.repeat() 755 | d = d.shuffle(buffer_size=100) 756 | 757 | d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) 758 | return d 759 | 760 | return input_fn 761 | 762 | 763 | # This function is not used by this file but is still used by the Colab and 764 | # people who depend on it. 
765 | def convert_examples_to_features(examples, label_list, max_seq_length, 766 | tokenizer): 767 | """Convert a set of `InputExample`s to a list of `InputFeatures`.""" 768 | 769 | features = [] 770 | for (ex_index, example) in enumerate(examples): 771 | if ex_index % 10000 == 0: 772 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 773 | 774 | feature = convert_single_example(ex_index, example, label_list, 775 | max_seq_length, tokenizer) 776 | 777 | features.append(feature) 778 | return features 779 | 780 | 781 | def main(_): 782 | tf.logging.set_verbosity(tf.logging.INFO) 783 | 784 | processors = { 785 | "cola": ColaProcessor, 786 | "mnli": MnliProcessor, 787 | "mrpc": MrpcProcessor, 788 | "xnli": XnliProcessor, 789 | "kaggle-quora": QuoraProcessor, 790 | } 791 | 792 | if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: 793 | raise ValueError( 794 | "At least one of `do_train`, `do_eval` or `do_predict' must be True.") 795 | 796 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 797 | 798 | if FLAGS.max_seq_length > bert_config.max_position_embeddings: 799 | raise ValueError( 800 | "Cannot use sequence length %d because the BERT model " 801 | "was only trained up to sequence length %d" % 802 | (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 803 | 804 | tf.gfile.MakeDirs(FLAGS.output_dir) 805 | 806 | task_name = FLAGS.task_name.lower() 807 | 808 | if task_name not in processors: 809 | raise ValueError("Task not found: %s" % (task_name)) 810 | 811 | processor = processors[task_name]() 812 | 813 | label_list = processor.get_labels() 814 | 815 | tokenizer = tokenization.FullTokenizer( 816 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 817 | 818 | tpu_cluster_resolver = None 819 | if FLAGS.use_tpu and FLAGS.tpu_name: 820 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( 821 | FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) 822 | 823 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 824 | run_config = tf.contrib.tpu.RunConfig( 825 | cluster=tpu_cluster_resolver, 826 | master=FLAGS.master, 827 | model_dir=FLAGS.output_dir, 828 | save_checkpoints_steps=FLAGS.save_checkpoints_steps, 829 | tpu_config=tf.contrib.tpu.TPUConfig( 830 | iterations_per_loop=FLAGS.iterations_per_loop, 831 | num_shards=FLAGS.num_tpu_cores, 832 | per_host_input_for_training=is_per_host)) 833 | 834 | train_examples = None 835 | num_train_steps = None 836 | num_warmup_steps = None 837 | if FLAGS.do_train: 838 | train_examples = processor.get_train_examples(FLAGS.data_dir) 839 | num_train_steps = int( 840 | len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) 841 | num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) 842 | 843 | model_fn = model_fn_builder( 844 | bert_config=bert_config, 845 | num_labels=len(label_list), 846 | init_checkpoint=FLAGS.init_checkpoint, 847 | learning_rate=FLAGS.learning_rate, 848 | num_train_steps=num_train_steps, 849 | num_warmup_steps=num_warmup_steps, 850 | use_tpu=FLAGS.use_tpu, 851 | use_one_hot_embeddings=FLAGS.use_tpu) 852 | 853 | # If TPU is not available, this will fall back to normal Estimator on CPU 854 | # or GPU. 
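  #
  # A typical invocation for the Quora task (illustrative only; the paths are
  # placeholders, and run_train.sh in this repo presumably wraps something
  # similar):
  #
  #   python run_classifier.py \
  #     --task_name=kaggle-quora --do_train=true --do_eval=true \
  #     --data_dir=$DATA_DIR \
  #     --vocab_file=$BERT_BASE_DIR/vocab.txt \
  #     --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  #     --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  #     --max_seq_length=128 --train_batch_size=32 \
  #     --learning_rate=5e-5 --num_train_epochs=3.0 \
  #     --output_dir=$OUTPUT_DIR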
855 | estimator = tf.contrib.tpu.TPUEstimator( 856 | use_tpu=FLAGS.use_tpu, 857 | model_fn=model_fn, 858 | config=run_config, 859 | train_batch_size=FLAGS.train_batch_size, 860 | eval_batch_size=FLAGS.eval_batch_size, 861 | predict_batch_size=FLAGS.predict_batch_size) 862 | 863 | if FLAGS.do_train: 864 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 865 | file_based_convert_examples_to_features( 866 | train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 867 | tf.logging.info("***** Running training *****") 868 | tf.logging.info(" Num examples = %d", len(train_examples)) 869 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 870 | tf.logging.info(" Num steps = %d", num_train_steps) 871 | train_input_fn = file_based_input_fn_builder( 872 | input_file=train_file, 873 | seq_length=FLAGS.max_seq_length, 874 | is_training=True, 875 | drop_remainder=True) 876 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 877 | 878 | if FLAGS.do_eval: 879 | eval_examples = processor.get_dev_examples(FLAGS.data_dir) 880 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 881 | file_based_convert_examples_to_features( 882 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 883 | 884 | tf.logging.info("***** Running evaluation *****") 885 | tf.logging.info(" Num examples = %d", len(eval_examples)) 886 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 887 | 888 | # This tells the estimator to run through the entire set. 889 | eval_steps = None 890 | # However, if running eval on the TPU, you will need to specify the 891 | # number of steps. 892 | if FLAGS.use_tpu: 893 | # Eval will be slightly WRONG on the TPU because it will truncate 894 | # the last batch. 895 | eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) 896 | 897 | eval_drop_remainder = True if FLAGS.use_tpu else False 898 | eval_input_fn = file_based_input_fn_builder( 899 | input_file=eval_file, 900 | seq_length=FLAGS.max_seq_length, 901 | is_training=False, 902 | drop_remainder=eval_drop_remainder) 903 | 904 | result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) 905 | 906 | output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") 907 | with tf.gfile.GFile(output_eval_file, "w") as writer: 908 | tf.logging.info("***** Eval results *****") 909 | for key in sorted(result.keys()): 910 | tf.logging.info(" %s = %s", key, str(result[key])) 911 | writer.write("%s = %s\n" % (key, str(result[key]))) 912 | 913 | if FLAGS.do_predict: 914 | predict_examples = processor.get_test_examples(FLAGS.data_dir) 915 | predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") 916 | file_based_convert_examples_to_features(predict_examples, label_list, 917 | FLAGS.max_seq_length, tokenizer, 918 | predict_file) 919 | 920 | tf.logging.info("***** Running prediction*****") 921 | tf.logging.info(" Num examples = %d", len(predict_examples)) 922 | tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) 923 | 924 | if FLAGS.use_tpu: 925 | # Warning: According to tpu_estimator.py Prediction on TPU is an 926 | # experimental feature and hence not supported here 927 | raise ValueError("Prediction in TPU not supported") 928 | 929 | predict_drop_remainder = True if FLAGS.use_tpu else False 930 | predict_input_fn = file_based_input_fn_builder( 931 | input_file=predict_file, 932 | seq_length=FLAGS.max_seq_length, 933 | is_training=False, 934 | drop_remainder=predict_drop_remainder) 935 | 936 | result = 
estimator.predict(input_fn=predict_input_fn)
937 | 
938 |     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.csv")
939 |     with tf.gfile.GFile(output_predict_file, "w") as writer:
940 |       tf.logging.info("***** Predict results *****")
941 |       for prediction in result:
942 |         # Write out the predicted probability of
943 |         # each class (0 and 1) for every test example,
944 |         # tab-separated.
945 |         output_line = "\t".join(
946 |             str(class_probability) for class_probability in prediction) + "\n"
947 |         writer.write(output_line)
948 | 
949 | 
950 | if __name__ == "__main__":
951 |   flags.mark_flag_as_required("data_dir")
952 |   flags.mark_flag_as_required("task_name")
953 |   flags.mark_flag_as_required("vocab_file")
954 |   flags.mark_flag_as_required("bert_config_file")
955 |   flags.mark_flag_as_required("output_dir")
956 |   tf.app.run()
957 | 
--------------------------------------------------------------------------------
/run_ner.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """BERT finetuning runner."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import collections
22 | import csv
23 | import os
24 | import modeling
25 | import optimization
26 | import tokenization
27 | import tensorflow as tf
28 | from sklearn.metrics import f1_score, precision_score, recall_score
29 | import codecs
30 | 
31 | flags = tf.flags
32 | 
33 | FLAGS = flags.FLAGS
34 | 
35 | ## Required parameters
36 | flags.DEFINE_string(
37 |     "data_dir", None,
38 |     "The input data dir. Should contain the .tsv files (or other data files) "
39 |     "for the task.")
40 | 
41 | flags.DEFINE_string(
42 |     "bert_config_file", None,
43 |     "The config json file corresponding to the pre-trained BERT model. "
44 |     "This specifies the model architecture.")
45 | 
46 | flags.DEFINE_string("task_name", None, "The name of the task to train.")
47 | 
48 | flags.DEFINE_string("vocab_file", None,
49 |                     "The vocabulary file that the BERT model was trained on.")
50 | 
51 | flags.DEFINE_string(
52 |     "output_dir", None,
53 |     "The output directory where the model checkpoints will be written.")
54 | 
55 | ## Other parameters
56 | 
57 | flags.DEFINE_string(
58 |     "init_checkpoint", None,
59 |     "Initial checkpoint (usually from a pre-trained BERT model).")
60 | 
61 | flags.DEFINE_bool(
62 |     "do_lower_case", True,
63 |     "Whether to lower case the input text. Should be True for uncased "
64 |     "models and False for cased models.")
65 | 
66 | flags.DEFINE_integer(
67 |     "max_seq_length", 128,
68 |     "The maximum total input sequence length after WordPiece tokenization. 
" 69 | "Sequences longer than this will be truncated, and sequences shorter " 70 | "than this will be padded.") 71 | 72 | flags.DEFINE_bool("do_train", False, "Whether to run training.") 73 | 74 | flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") 75 | 76 | flags.DEFINE_bool( 77 | "do_predict", False, 78 | "Whether to run the model in inference mode on the test set.") 79 | 80 | flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") 81 | 82 | flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") 83 | 84 | flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") 85 | 86 | flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") 87 | 88 | flags.DEFINE_float("num_train_epochs", 3.0, 89 | "Total number of training epochs to perform.") 90 | 91 | flags.DEFINE_float( 92 | "warmup_proportion", 0.1, 93 | "Proportion of training to perform linear learning rate warmup for. " 94 | "E.g., 0.1 = 10% of training.") 95 | 96 | flags.DEFINE_integer("save_checkpoints_steps", 1000, 97 | "How often to save the model checkpoint.") 98 | 99 | flags.DEFINE_integer("iterations_per_loop", 1000, 100 | "How many steps to make in each estimator call.") 101 | 102 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 103 | 104 | tf.flags.DEFINE_string( 105 | "tpu_name", None, 106 | "The Cloud TPU to use for training. This should be either the name " 107 | "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " 108 | "url.") 109 | 110 | tf.flags.DEFINE_string( 111 | "tpu_zone", None, 112 | "[Optional] GCE zone where the Cloud TPU is located in. If not " 113 | "specified, we will attempt to automatically detect the GCE project from " 114 | "metadata.") 115 | 116 | tf.flags.DEFINE_string( 117 | "gcp_project", None, 118 | "[Optional] Project name for the Cloud TPU-enabled project. If not " 119 | "specified, we will attempt to automatically detect the GCE project from " 120 | "metadata.") 121 | 122 | tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") 123 | 124 | flags.DEFINE_integer( 125 | "num_tpu_cores", 8, 126 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 127 | 128 | 129 | class InputExample(object): 130 | """A single training/test example for simple sequence classification.""" 131 | 132 | def __init__(self, guid, text, label=None): 133 | """Constructs a InputExample. 134 | 135 | Args: 136 | guid: Unique id for the example. 137 | text: string. The untokenized text of the first sequence. For single 138 | sequence tasks, only this sequence must be specified. 139 | label: (Optional) string. The label of the example. This should be 140 | specified for train and dev examples, but not for test examples. 
141 | """ 142 | self.guid = guid 143 | self.text = text 144 | self.label = label 145 | 146 | 147 | class InputFeatures(object): 148 | """A single set of features of data.""" 149 | 150 | def __init__(self, input_ids, input_mask, segment_ids, label_ids): 151 | self.input_ids = input_ids 152 | self.input_mask = input_mask 153 | self.segment_ids = segment_ids 154 | self.label_ids = label_ids 155 | 156 | 157 | class DataProcessor(object): 158 | """Base class for data converters for sequence classification data sets.""" 159 | 160 | def get_train_examples(self, data_dir): 161 | """Gets a collection of `InputExample`s for the train set.""" 162 | raise NotImplementedError() 163 | 164 | def get_dev_examples(self, data_dir): 165 | """Gets a collection of `InputExample`s for the dev set.""" 166 | raise NotImplementedError() 167 | 168 | def get_test_examples(self, data_dir): 169 | """Gets a collection of `InputExample`s for prediction.""" 170 | raise NotImplementedError() 171 | 172 | def get_labels(self): 173 | """Gets the list of labels for this data set.""" 174 | raise NotImplementedError() 175 | 176 | @classmethod 177 | def _read_tsv(cls, input_file, quotechar=None): 178 | """BIO""" 179 | # with tf.gfile.Open(input_file, "r") as f: 180 | # reader = csv.reader(f, delimiter=" ", quotechar=quotechar) 181 | # lines = [] 182 | # for line in reader: 183 | # lines.append(line) 184 | # return lines 185 | with codecs.open(input_file, "r", encoding="utf-8") as f: 186 | lines = [] 187 | words = [] 188 | labels = [] 189 | for line in f: 190 | content = line.strip() 191 | word = line.strip().split("\t")[0] 192 | if len(line.strip().split()) > 1: 193 | label = line.strip().split("\t")[1] 194 | else: 195 | label = "O" 196 | if len(content) == 0 and words[-1] == "。": 197 | l = " ".join([label for label in labels if len(label) > 0]) 198 | w = " ".join([word for word in words if len(word) > 0]) 199 | lines.append([w, l]) 200 | words = [] 201 | labels = [] 202 | continue 203 | words.append(word) 204 | labels.append(label) 205 | return lines 206 | 207 | 208 | class NerProcessor(DataProcessor): 209 | """processor for kaggle quora""" 210 | 211 | def get_train_examples(self, data_dir): 212 | return self._create_examples( 213 | self._read_tsv(os.path.join(data_dir, "train.csv")), "train" 214 | ) 215 | 216 | def get_dev_examples(self, data_dir): 217 | return self._create_examples( 218 | self._read_tsv(os.path.join(data_dir, "dev.csv")), "dev" 219 | ) 220 | 221 | def get_test_examples(self, data_dir): 222 | return self._create_examples( 223 | self._read_tsv(os.path.join(data_dir, "test.csv")), "test" 224 | ) 225 | 226 | def get_labels(self): 227 | ''' 228 | ner class 229 | :return: 230 | ''' 231 | return ["X", 'O', 'B-Disease', 'I-Disease', 'B-Reason', 'I-Reason', "B-Symptom", "I-Symptom", "B-Test", 232 | "I-Test", 233 | "B-Test_Value", "I-Test_Value", "B-Drug", "I-Drug", "B-Frequency", "I-Frequency", "B-Amount", 234 | "I-Amount", 235 | "B-Treatment", "I-Treatment", "B-Operation", "I-Operation", "B-Method", "I-Method", "B-SideEff", 236 | "I-SideEff", "B-Anatomy", "I-Anatomy", "B-Level", "I-Level", "B-Duration", "I-Duration"] 237 | 238 | def _create_examples(self, lines, set_type): 239 | examples = [] 240 | label_list = [] 241 | for (i, line) in enumerate(lines): 242 | guid = "%s-%s" % (set_type, i) 243 | 244 | text = tokenization.convert_to_unicode(line[0]) 245 | label = tokenization.convert_to_unicode(line[1]) 246 | label_list.append(label) 247 | examples.append(InputExample(guid=guid, text=text, label=label)) 248 | 249 | 
if set_type == "test": 250 | return examples 251 | else: 252 | return examples, label_list 253 | 254 | 255 | def convert_single_example(ex_index, example, label_list, max_seq_length, 256 | tokenizer): 257 | """Converts a single `InputExample` into a single `InputFeatures`.""" 258 | label_map = {} 259 | for (i, label) in enumerate(label_list): 260 | label_map[label] = i 261 | text_list = example.text.split(" ") 262 | labellist = example.label.split(" ") 263 | tokens = [] 264 | labels = [] 265 | for i, word in enumerate(text_list): 266 | token = tokenizer.tokenize(word) 267 | tokens.extend(token) 268 | label_ = labellist[i] 269 | for n in range(len(token)): 270 | if n == 0: 271 | labels.append(label_) 272 | else: 273 | labels.append("X") 274 | 275 | # Account for [CLS] and [SEP] with "- 2" 276 | if len(tokens) > max_seq_length - 2: 277 | tokens = tokens[0:(max_seq_length - 2)] 278 | labels = labels[0: (max_seq_length - 2)] 279 | 280 | # The convention in BERT is: 281 | # (a) For sequence pairs: 282 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 283 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 284 | # (b) For single sequences: 285 | # tokens: [CLS] the dog is hairy . [SEP] 286 | # type_ids: 0 0 0 0 0 0 0 287 | # 288 | # Where "type_ids" are used to indicate whether this is the first 289 | # sequence or the second sequence. The embedding vectors for `type=0` and 290 | # `type=1` were learned during pre-training and are added to the wordpiece 291 | # embedding vector (and position vector). This is not *strictly* necessary 292 | # since the [SEP] token unambiguously separates the sequences, but it makes 293 | # it easier for the model to learn the concept of sequences. 294 | # 295 | # For classification tasks, the first vector (corresponding to [CLS]) is 296 | # used as as the "sentence vector". Note that this only makes sense because 297 | # the entire model is fine-tuned. 298 | ntokens = [] 299 | segment_ids = [] 300 | label_ids = [] 301 | ntokens.append("[CLS]") 302 | segment_ids.append(0) 303 | label_ids.append(0) 304 | for i, token in enumerate(tokens): 305 | ntokens.append(token) 306 | segment_ids.append(0) 307 | label_ids.append(label_map[labels[i]]) 308 | ntokens.append("[SEP]") 309 | segment_ids.append(0) 310 | label_ids.append(0) 311 | 312 | input_ids = tokenizer.convert_tokens_to_ids(ntokens) 313 | 314 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 315 | # tokens are attended to. 316 | input_mask = [1] * len(input_ids) 317 | 318 | # Zero-pad up to the sequence length. 
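  # Note that label_ids is padded with 0 as well: index 0 is "X", the first
  # entry of get_labels(), i.e. the same placeholder id that [CLS], [SEP] and
  # non-initial WordPieces receive above. Padded positions are excluded from
  # attention by input_mask, but they still contribute to the mean loss
  # computed in create_model below.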
319 | while len(input_ids) < max_seq_length: 320 | input_ids.append(0) 321 | input_mask.append(0) 322 | segment_ids.append(0) 323 | label_ids.append(0) 324 | 325 | assert len(input_ids) == max_seq_length 326 | assert len(input_mask) == max_seq_length 327 | assert len(segment_ids) == max_seq_length 328 | assert len(label_ids) == max_seq_length 329 | 330 | if ex_index < 5: 331 | tf.logging.info("*** Example ***") 332 | tf.logging.info("guid: %s" % (example.guid)) 333 | tf.logging.info("tokens: %s" % " ".join( 334 | [tokenization.printable_text(x) for x in tokens])) 335 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 336 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 337 | tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 338 | tf.logging.info("label_ids: %s " % " ".join([str(x) for x in label_ids])) 339 | 340 | feature = InputFeatures( 341 | input_ids=input_ids, 342 | input_mask=input_mask, 343 | segment_ids=segment_ids, 344 | label_ids=label_ids) 345 | return feature 346 | 347 | 348 | def file_based_convert_examples_to_features( 349 | examples, label_list, max_seq_length, tokenizer, output_file): 350 | """Convert a set of `InputExample`s to a TFRecord file.""" 351 | 352 | writer = tf.python_io.TFRecordWriter(output_file) 353 | 354 | for (ex_index, example) in enumerate(examples): 355 | if ex_index % 10000 == 0: 356 | tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) 357 | 358 | feature = convert_single_example(ex_index, example, label_list, 359 | max_seq_length, tokenizer) 360 | 361 | def create_int_feature(values): 362 | f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) 363 | return f 364 | 365 | features = collections.OrderedDict() 366 | features["input_ids"] = create_int_feature(feature.input_ids) 367 | features["input_mask"] = create_int_feature(feature.input_mask) 368 | features["segment_ids"] = create_int_feature(feature.segment_ids) 369 | features["label_ids"] = create_int_feature(feature.label_ids) 370 | 371 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 372 | writer.write(tf_example.SerializeToString()) 373 | 374 | 375 | def file_based_input_fn_builder(input_file, seq_length, is_training, 376 | drop_remainder): 377 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 378 | 379 | name_to_features = { 380 | "input_ids": tf.FixedLenFeature([seq_length], tf.int64), 381 | "input_mask": tf.FixedLenFeature([seq_length], tf.int64), 382 | "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), 383 | "label_ids": tf.FixedLenFeature([seq_length], tf.int64), 384 | } 385 | 386 | def _decode_record(record, name_to_features): 387 | """Decodes a record to a TensorFlow example.""" 388 | example = tf.parse_single_example(record, name_to_features) 389 | 390 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 391 | # So cast all int64 to int32. 392 | for name in list(example.keys()): 393 | t = example[name] 394 | if t.dtype == tf.int64: 395 | t = tf.to_int32(t) 396 | example[name] = t 397 | 398 | return example 399 | 400 | def input_fn(params): 401 | """The actual input function.""" 402 | batch_size = params["batch_size"] 403 | 404 | # For training, we want a lot of parallel reading and shuffling. 405 | # For eval, we want no shuffling and parallel reading doesn't matter. 
406 | d = tf.data.TFRecordDataset(input_file) 407 | if is_training: 408 | d = d.repeat() 409 | d = d.shuffle(buffer_size=100) 410 | 411 | d = d.apply( 412 | tf.contrib.data.map_and_batch( 413 | lambda record: _decode_record(record, name_to_features), 414 | batch_size=batch_size, 415 | drop_remainder=drop_remainder)) 416 | 417 | return d 418 | 419 | return input_fn 420 | 421 | 422 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 423 | """Truncates a sequence pair in place to the maximum length.""" 424 | 425 | # This is a simple heuristic which will always truncate the longer sequence 426 | # one token at a time. This makes more sense than truncating an equal percent 427 | # of tokens from each, since if one sequence is very short then each token 428 | # that's truncated likely contains more information than a longer sequence. 429 | while True: 430 | total_length = len(tokens_a) + len(tokens_b) 431 | if total_length <= max_length: 432 | break 433 | if len(tokens_a) > len(tokens_b): 434 | tokens_a.pop() 435 | else: 436 | tokens_b.pop() 437 | 438 | 439 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 440 | labels, num_labels, use_one_hot_embeddings): 441 | """Creates a classification model.""" 442 | model = modeling.BertModel( 443 | config=bert_config, 444 | is_training=is_training, 445 | input_ids=input_ids, 446 | input_mask=input_mask, 447 | token_type_ids=segment_ids, 448 | use_one_hot_embeddings=use_one_hot_embeddings) 449 | 450 | # In the demo, we are doing a simple classification task on the entire 451 | # segment. 452 | # 453 | # If you want to use the token-level output, use model.get_sequence_output() 454 | # instead. 455 | output_layer = model.get_sequence_output() 456 | # final_hidden_shape = modeling.get_shape_list(output_layer, expected_rank=3) 457 | # batch_size = final_hidden_shape[0] 458 | # seq_length = final_hidden_shape[1] 459 | # hidden_size = final_hidden_shape[2] 460 | 461 | hidden_size = output_layer.shape[-1].value 462 | 463 | output_weights = tf.get_variable( 464 | "output_weights", [num_labels, hidden_size], 465 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 466 | 467 | output_bias = tf.get_variable( 468 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 469 | 470 | with tf.variable_scope("loss"): 471 | if is_training: 472 | # I.e., 0.1 dropout 473 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 474 | output_layer = tf.reshape(output_layer, [-1, hidden_size]) 475 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 476 | logits = tf.nn.bias_add(logits, output_bias) 477 | # probabilities = tf.nn.softmax(logits, axis=-1) 478 | logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) 479 | log_probs = tf.nn.log_softmax(logits, axis=-1) 480 | 481 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 482 | # print(tf.shape(one_hot_labels)) 483 | # Tensor("loss/Shape_2:0", shape=(2,), dtype=int32) 484 | 485 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 486 | loss = tf.reduce_mean(per_example_loss) 487 | 488 | return (loss, per_example_loss, logits) 489 | 490 | 491 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 492 | num_train_steps, num_warmup_steps, use_tpu, 493 | use_one_hot_embeddings): 494 | """Returns `model_fn` closure for TPUEstimator.""" 495 | 496 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 497 | """The `model_fn` for TPUEstimator.""" 498 | 
499 |     tf.logging.info("*** Features ***")
500 |     for name in sorted(features.keys()):
501 |       tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
502 | 
503 |     input_ids = features["input_ids"]
504 |     input_mask = features["input_mask"]
505 |     segment_ids = features["segment_ids"]
506 |     label_ids = features["label_ids"]
507 | 
508 |     is_training = (mode == tf.estimator.ModeKeys.TRAIN)
509 | 
510 |     (total_loss, per_example_loss, logits) = create_model(
511 |         bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
512 |         num_labels, use_one_hot_embeddings)
513 | 
514 |     tvars = tf.trainable_variables()
515 |     initialized_variable_names = {}  # stays empty when no checkpoint is restored
516 |     scaffold_fn = None
517 |     if init_checkpoint:
518 |       (assignment_map, initialized_variable_names
519 |       ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
520 |       if use_tpu:
521 | 
522 |         def tpu_scaffold():
523 |           tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
524 |           return tf.train.Scaffold()
525 | 
526 |         scaffold_fn = tpu_scaffold
527 |       else:
528 |         tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
529 | 
530 |     tf.logging.info("**** Trainable Variables ****")
531 |     for var in tvars:
532 |       init_string = ""
533 |       if var.name in initialized_variable_names:
534 |         init_string = ", *INIT_FROM_CKPT*"
535 |       tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
536 |                       init_string)
537 | 
538 |     output_spec = None
539 |     if mode == tf.estimator.ModeKeys.TRAIN:
540 | 
541 |       train_op = optimization.create_optimizer(
542 |           total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
543 | 
544 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
545 |           mode=mode,
546 |           loss=total_loss,
547 |           train_op=train_op,
548 |           scaffold_fn=scaffold_fn)
549 |     elif mode == tf.estimator.ModeKeys.EVAL:
550 | 
551 |       def metric_fn(per_example_loss, label_ids, logits):
552 |         predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
553 |         accuracy = tf.metrics.accuracy(label_ids, predictions)
554 |         # Per-label precision and recall for the NER task are computed
555 |         # separately with sklearn's classification_report in get_eval().
556 |         loss = tf.metrics.mean(per_example_loss)
557 |         return {
558 |             "eval_accuracy": accuracy,
559 |             "eval_loss": loss,
560 |         }
561 | 
562 |       eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
563 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
564 |           mode=mode,
565 |           loss=total_loss,
566 |           eval_metrics=eval_metrics,
567 |           scaffold_fn=scaffold_fn)
568 |     elif mode == tf.estimator.ModeKeys.PREDICT:
569 |       predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
570 |       predict_output = {"values": predictions}
571 |       export_outputs = {"predictions": tf.estimator.export.PredictOutput(predict_output)}
572 | 
573 |       output_spec = tf.contrib.tpu.TPUEstimatorSpec(
574 |           mode=mode,
575 |           predictions=predict_output,
576 |           export_outputs=export_outputs,
577 |           scaffold_fn=scaffold_fn)
578 |     else:
579 |       raise ValueError("Unsupported mode: %s" % mode)
580 |     return output_spec
581 | 
582 |   return model_fn
583 | 
584 | 
585 | def get_eval(pred_result, real_labels, label_list, max_seq_length):
586 |   label_map = {}
587 |   for i, label in enumerate(label_list):
588 |     label_map[label] = i
589 |   import itertools
590 |   predictions = list(itertools.islice(pred_result, len(real_labels)))
591 |   pred_labels = []
592 |   real_labels_ = []
593 |   # Skip examples whose label sequence cannot fit once [CLS]/[SEP] are added.
594 |   for i in range(len(predictions)):
595 |     real = real_labels[i]
596 |     if len(real.split(" ")) > max_seq_length - 2:
597 |       continue
598 |     real_ = [label_map[k] for k in real.split(" ")]
599 |     real_labels_.extend(real_)
600 |     pred = predictions[i]["values"][1: len(real_) + 1]  # position 0 is [CLS]
601 |     pred_labels.extend(pred)
602 |     assert len(real_) == len(pred)
603 |   from sklearn.metrics import classification_report
604 |   print(classification_report(real_labels_, pred_labels))
605 | 
606 | 
607 | def main(_):
608 |   tf.logging.set_verbosity(tf.logging.INFO)
609 | 
610 |   processors = {
611 |       "ner": NerProcessor,
612 |   }
613 | 
614 |   if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
615 |     raise ValueError(
616 |         "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
617 | 
618 |   bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
619 | 
620 |   if FLAGS.max_seq_length > bert_config.max_position_embeddings:
621 |     raise ValueError(
622 |         "Cannot use sequence length %d because the BERT model "
623 |         "was only trained up to sequence length %d" %
624 |         (FLAGS.max_seq_length, bert_config.max_position_embeddings))
625 | 
626 |   tf.gfile.MakeDirs(FLAGS.output_dir)
627 | 
628 |   task_name = FLAGS.task_name.lower()
629 | 
630 |   if task_name not in processors:
631 |     raise ValueError("Task not found: %s" % task_name)
632 | 
633 |   processor = processors[task_name]()
634 | 
635 |   label_list = processor.get_labels()
636 | 
637 |   tokenizer = tokenization.FullTokenizer(
638 |       vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
639 | 
640 |   tpu_cluster_resolver = None
641 |   if FLAGS.use_tpu and FLAGS.tpu_name:
642 |     tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
643 |         FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
644 | 
645 |   is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
646 |   run_config = tf.contrib.tpu.RunConfig(
647 |       cluster=tpu_cluster_resolver,
648 |       master=FLAGS.master,
649 |       model_dir=FLAGS.output_dir,
650 |       save_checkpoints_steps=FLAGS.save_checkpoints_steps,
651 |       tpu_config=tf.contrib.tpu.TPUConfig(
652 |           iterations_per_loop=FLAGS.iterations_per_loop,
653 |           num_shards=FLAGS.num_tpu_cores,
654 |           per_host_input_for_training=is_per_host))
655 | 
656 |   train_examples = None
657 |   num_train_steps = None
658 |   num_warmup_steps = None
659 |   if FLAGS.do_train:
660 |     train_examples, _ = processor.get_train_examples(FLAGS.data_dir)
661 |     num_train_steps = int(
662 |         len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
663 |     num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
664 | 
665 |   model_fn = model_fn_builder(
666 |       bert_config=bert_config,
667 |       num_labels=len(label_list),
668 |       init_checkpoint=FLAGS.init_checkpoint,
669 |       learning_rate=FLAGS.learning_rate,
670 |       num_train_steps=num_train_steps,
671 |       num_warmup_steps=num_warmup_steps,
672 |       use_tpu=FLAGS.use_tpu,
673 |       use_one_hot_embeddings=FLAGS.use_tpu)
674 | 
675 |   # If TPU is not available, this will fall back to normal Estimator on CPU
676 |   # or GPU.
677 | estimator = tf.contrib.tpu.TPUEstimator( 678 | use_tpu=FLAGS.use_tpu, 679 | model_fn=model_fn, 680 | config=run_config, 681 | train_batch_size=FLAGS.train_batch_size, 682 | eval_batch_size=FLAGS.eval_batch_size, 683 | predict_batch_size=FLAGS.predict_batch_size) 684 | 685 | if FLAGS.do_train: 686 | train_file = os.path.join(FLAGS.output_dir, "train.tf_record") 687 | file_based_convert_examples_to_features( 688 | train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) 689 | tf.logging.info("***** Running training *****") 690 | tf.logging.info(" Num examples = %d", len(train_examples)) 691 | tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) 692 | tf.logging.info(" Num steps = %d", num_train_steps) 693 | train_input_fn = file_based_input_fn_builder( 694 | input_file=train_file, 695 | seq_length=FLAGS.max_seq_length, 696 | is_training=True, 697 | drop_remainder=True) 698 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 699 | 700 | if FLAGS.do_eval: 701 | eval_examples, real_labels = processor.get_dev_examples(FLAGS.data_dir) 702 | eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") 703 | file_based_convert_examples_to_features( 704 | eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) 705 | 706 | tf.logging.info("***** Running evaluation *****") 707 | tf.logging.info(" Num examples = %d", len(eval_examples)) 708 | tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) 709 | 710 | # This tells the estimator to run through the entire set. 711 | eval_steps = None 712 | # However, if running eval on the TPU, you will need to specify the 713 | # number of steps. 714 | if FLAGS.use_tpu: 715 | # Eval will be slightly WRONG on the TPU because it will truncate 716 | # the last batch. 
717 |       eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
718 | 
719 |     eval_drop_remainder = True if FLAGS.use_tpu else False
720 |     eval_input_fn = file_based_input_fn_builder(
721 |         input_file=eval_file,
722 |         seq_length=FLAGS.max_seq_length,
723 |         is_training=False,
724 |         drop_remainder=eval_drop_remainder)
725 | 
726 |     result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
727 | 
728 |     output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
729 |     with tf.gfile.GFile(output_eval_file, "w") as writer:
730 |       tf.logging.info("***** Eval results *****")
731 |       for key in sorted(result.keys()):
732 |         tf.logging.info("  %s = %s", key, str(result[key]))
733 |         writer.write("%s = %s\n" % (key, str(result[key])))
734 |     pred_result = estimator.predict(input_fn=eval_input_fn)
735 |     get_eval(pred_result, real_labels, label_list, FLAGS.max_seq_length)
736 | 
737 |   if FLAGS.do_predict:
738 |     predict_examples = processor.get_test_examples(FLAGS.data_dir)
739 |     predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
740 |     file_based_convert_examples_to_features(predict_examples, label_list,
741 |                                             FLAGS.max_seq_length, tokenizer,
742 |                                             predict_file)
743 | 
744 |     tf.logging.info("***** Running prediction *****")
745 |     tf.logging.info("  Num examples = %d", len(predict_examples))
746 |     tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
747 | 
748 |     if FLAGS.use_tpu:
749 |       # Warning: according to tpu_estimator.py, prediction on TPU is an
750 |       # experimental feature and hence not supported here.
751 |       raise ValueError("Prediction on TPU is not supported")
752 | 
753 |     predict_drop_remainder = True if FLAGS.use_tpu else False
754 |     predict_input_fn = file_based_input_fn_builder(
755 |         input_file=predict_file,
756 |         seq_length=FLAGS.max_seq_length,
757 |         is_training=False,
758 |         drop_remainder=predict_drop_remainder)
759 | 
760 |     result = estimator.predict(input_fn=predict_input_fn)
761 | 
762 |     output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")  # tab-separated
763 |     with tf.gfile.GFile(output_predict_file, "w") as writer:
764 |       tf.logging.info("***** Predict results *****")
765 |       for prediction in result:
766 |         output_line = "\t".join(  # each prediction dict's "values" holds label ids
767 |             str(label_id) for label_id in prediction["values"]) + "\n"
768 |         writer.write(output_line)
769 | 
770 | 
771 | if __name__ == "__main__":
772 |   flags.mark_flag_as_required("data_dir")
773 |   flags.mark_flag_as_required("task_name")
774 |   flags.mark_flag_as_required("vocab_file")
775 |   flags.mark_flag_as_required("bert_config_file")
776 |   flags.mark_flag_as_required("output_dir")
777 |   tf.app.run()
778 | 
--------------------------------------------------------------------------------
/run_predict.sh:
--------------------------------------------------------------------------------
1 | export BERT_BASE_DIR=/path/bert/uncased_L-12_H-768_A-12
2 | export QUORA_DIR=/path/kaggle/quora/data
3 | export TRAINED_CLASSIFIER=/path/quora/data/result
4 | CUDA_VISIBLE_DEVICES=0 python3 ./bert/run_classifier.py \
5 |   --task_name=kaggle-quora \
6 |   --do_train=false \
7 |   --do_predict=true \
8 |   --data_dir=$QUORA_DIR/ \
9 |   --vocab_file=$BERT_BASE_DIR/vocab.txt \
10 |   --bert_config_file=$BERT_BASE_DIR/bert_config.json \
11 |   --init_checkpoint=$TRAINED_CLASSIFIER/model.ckpt \
12 |   --max_seq_length=128 \
13 |   --output_dir=$QUORA_DIR/result/
--------------------------------------------------------------------------------
/run_train.sh:
--------------------------------------------------------------------------------
1 | export BERT_BASE_DIR=/path/bert/uncased_L-12_H-768_A-12
2 | export QUORA_DIR=/path/kaggle/quora/data
3 | export CUDA_VISIBLE_DEVICES=1,0
4 | python3 ./bert/run_classifier.py \
5 |   --task_name=kaggle-quora \
6 |   --do_train=true \
7 |   --do_eval=true \
8 |   --data_dir=$QUORA_DIR/ \
9 |   --vocab_file=$BERT_BASE_DIR/vocab.txt \
10 |   --bert_config_file=$BERT_BASE_DIR/bert_config.json \
11 |   --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
12 |   --max_seq_length=128 \
13 |   --train_batch_size=32 \
14 |   --eval_batch_size=64 \
15 |   --predict_batch_size=8 \
16 |   --learning_rate=2e-5 \
17 |   --num_train_epochs=3.0 \
18 |   --output_dir=$QUORA_DIR/result/
19 | 
--------------------------------------------------------------------------------
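Note on launching the NER runner: the shell scripts above cover only the Quora classifier. The NER script (registered under task name "ner" in its processors map) accepts the same flags, all of which are defined at the top of the runner. A minimal launch sketch follows — the checkpoint directory and data paths are placeholder assumptions, not part of the repo:

export BERT_BASE_DIR=/path/bert/chinese_L-12_H-768_A-12   # assumed Chinese BERT checkpoint
export NER_DIR=/path/tianchi/ner/data                     # placeholder data directory
CUDA_VISIBLE_DEVICES=0 python3 ./bert/run_ner.py \
  --task_name=ner \
  --do_train=true \
  --do_eval=true \
  --data_dir=$NER_DIR/ \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --output_dir=$NER_DIR/result/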